We are collecting a dataset on water quality to train a machine learning model for binary classification: determining whether water is safe for consumption (1) or not (0). This model will help with water treatment decisions and ensure compliance with quality standards. We applied different summarization and plotting methods to help us to understand our dataset, such as scatter, histogram and bar plot. Then, we applyed preprocess in our data using data cleaning, data transformation and feature selection.
#library:
#install.packages("caret")
#install.packages("glmnet")
#install.packages("Boruta")
#install.packages("mlbench")
#install.packages("randomForest")
library(outliers)
library(dplyr)
library(mlbench)
library(caret)
library(glmnet)
library(Boruta)
library(ggplot2)
library(randomForest)
library(pROC)
library(e1071)
library(caret)
library(party)
library(partykit)
library(RWeka)
library(C50)
library(printr)
library(rpart)
library(rpart.plot)
getwd()
[1] "/Users/mahayie/Documents/GitHub/DM1Project"
#setwd("/Users/mahayie/Desktop/326p")
#getwd()
water_potability = read.csv('Dataset/water_potability.csv')
View(water_potability)
str(water_potability)
'data.frame': 3276 obs. of 10 variables:
$ ph : num NA 3.72 8.1 8.32 9.09 ...
$ Hardness : num 205 129 224 214 181 ...
$ Solids : num 20791 18630 19910 22018 17979 ...
$ Chloramines : num 7.3 6.64 9.28 8.06 6.55 ...
$ Sulfate : num 369 NA NA 357 310 ...
$ Conductivity : num 564 593 419 363 398 ...
$ Organic_carbon : num 10.4 15.2 16.9 18.4 11.6 ...
$ Trihalomethanes: num 87 56.3 66.4 100.3 32 ...
$ Turbidity : num 2.96 4.5 3.06 4.63 4.08 ...
$ Potability : int 0 0 0 0 0 0 0 0 0 0 ...
summary(water_potability)
ph Hardness Solids Chloramines Sulfate Conductivity
Min. : 0.000 Min. : 47.43 Min. : 320.9 Min. : 0.352 Min. :129.0 Min. :181.5
1st Qu.: 6.093 1st Qu.:176.85 1st Qu.:15666.7 1st Qu.: 6.127 1st Qu.:307.7 1st Qu.:365.7
Median : 7.037 Median :196.97 Median :20927.8 Median : 7.130 Median :333.1 Median :421.9
Mean : 7.081 Mean :196.37 Mean :22014.1 Mean : 7.122 Mean :333.8 Mean :426.2
3rd Qu.: 8.062 3rd Qu.:216.67 3rd Qu.:27332.8 3rd Qu.: 8.115 3rd Qu.:360.0 3rd Qu.:481.8
Max. :14.000 Max. :323.12 Max. :61227.2 Max. :13.127 Max. :481.0 Max. :753.3
NA's :491 NA's :781
Organic_carbon Trihalomethanes Turbidity Potability
Min. : 2.20 Min. : 0.738 Min. :1.450 Min. :0.0000
1st Qu.:12.07 1st Qu.: 55.845 1st Qu.:3.440 1st Qu.:0.0000
Median :14.22 Median : 66.622 Median :3.955 Median :0.0000
Mean :14.28 Mean : 66.396 Mean :3.967 Mean :0.3901
3rd Qu.:16.56 3rd Qu.: 77.337 3rd Qu.:4.500 3rd Qu.:1.0000
Max. :28.30 Max. :124.000 Max. :6.739 Max. :1.0000
NA's :162
Checking for missing values:
dim(water_potability)
[1] 3276 10
sum(is.na(water_potability))
[1] 1434
Remove rows with missing values
colSums(is.na(water_potability))
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
491 0 0 0 781 0 0
Trihalomethanes Turbidity Potability
162 0 0
water_potability = na.omit(water_potability)
colSums(is.na(water_potability))
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
0 0 0 0 0 0 0
Trihalomethanes Turbidity Potability
0 0 0
View(water_potability)
Description: The absence of data in certain variables or columns in a dataset is referred to as missing or null values due to various reasons. It can have a negative impact on the dataset’s efficiency and the information that can be taken from it later, so we checked to see whether our data had missing or null values and eliminated these rows to produce a more efficient dataset.
Standard deviation:
sd(water_potability$Turbidity)
[1] 0.7803462
sd(water_potability$Solids)
[1] 8642.24
sd(water_potability$Conductivity)
[1] 80.71257
sd(water_potability$Organic_carbon)
[1] 3.324959
sd(water_potability$ph)
[1] 1.573337
Mean:
mean(water_potability$Turbidity)
[1] 3.969729
mean(water_potability$Solids)
[1] 21917.44
mean(water_potability$Conductivity)
[1] 426.5264
mean(water_potability$Organic_carbon)
[1] 14.35771
mean(water_potability$ph)
[1] 7.08599
Median
median(water_potability$Turbidity)
[1] 3.968177
median(water_potability$Solids)
[1] 20933.51
median(water_potability$Conductivity)
[1] 423.4559
median(water_potability$Organic_carbon)
[1] 14.32202
median(water_potability$ph)
[1] 7.027297
Variance
var(water_potability$Turbidity)
[1] 0.6089401
var(water_potability$Solids)
[1] 74688309
var(water_potability$Conductivity)
[1] 6514.519
var(water_potability$Organic_carbon)
[1] 11.05535
var(water_potability$ph)
[1] 2.475388
Statistical Measures:
summary(water_potability$Conductivity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
201.6 366.7 423.5 426.5 482.4 753.3
summary(water_potability$Organic_carbon)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.20 12.12 14.32 14.36 16.68 27.01
summary(water_potability$Hardness)
Min. 1st Qu. Median Mean 3rd Qu. Max.
73.49 176.74 197.19 195.97 216.44 317.34
summary(water_potability$Solids)
Min. 1st Qu. Median Mean 3rd Qu. Max.
320.9 15615.7 20933.5 21917.4 27182.6 56488.7
summary(water_potability$Chloramines)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.391 6.139 7.144 7.134 8.110 13.127
summary(water_potability$Potability)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.4033 1.0000 1.0000
summary(water_potability$Sulfate)
Min. 1st Qu. Median Mean 3rd Qu. Max.
129.0 307.6 332.2 333.2 359.3 481.0
summary(water_potability$Trihalomethanes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.577 55.953 66.542 66.401 77.292 124.000
summary(water_potability$Turbidity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.450 3.443 3.968 3.970 4.514 6.495
summary(water_potability$ph)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2275 6.0897 7.0273 7.0860 8.0530 14.0000
Descriotion: With using minimum, maximum, mean, median laws it helps to provide an overview of the data’s key characteristics
Data Transformation:
water_potability$Potability[water_potability$Potability == '0'] <- 'Not Potable'
water_potability$Potability[water_potability$Potability == '1'] <- 'Potable'
water_potability$Potability <- as.factor(water_potability$Potability)
table(water_potability$Potability)
Not Potable Potable
1200 811
print(water_potability)
Description: This step involved transforming the class label, Potability, into categorical data. We changed the numeric data to ‘Not Potable’ and ‘Potable’ to indicate whether the water is safe for human consumption, where 1 represents ‘Potable’, and 0 represents ‘Not Potable’.
Outliers before removing outlier:
dim(water_potability)
[1] 2011 10
head(water_potability)
removing outliers:
summary(water_potability$ph)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2275 6.0897 7.0273 7.0860 8.0530 14.0000
quartiles <- quantile(water_potability$ph, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
6.089723 8.052969
iqr <- IQR(water_potability$ph)
iqr
[1] 1.963245
lower <- quartiles[1] - 1.5*iqr
lower
25%
3.144855
upper <- quartiles[2] + 1.5*iqr
upper
75%
10.99784
boxplot(ph ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$ph, ylab = 'ph')$out
out_val
out_rows <- which(water_potability$ph %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$ph)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.231 6.105 7.027 7.087 8.030 10.905
#-------------------------------------------
-Hardness
summary(water_potability$Hardness)
Min. 1st Qu. Median Mean 3rd Qu. Max.
73.49 176.90 197.36 196.27 216.44 317.34
quartiles <- quantile(water_potability$Hardness, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
176.9031 216.4411
iqr <- IQR(water_potability$Hardness)
iqr
[1] 39.53799
lower <- quartiles[1] - 1.5*iqr
lower
25%
117.5961
upper <- quartiles[2] + 1.5*iqr
upper
75%
275.7481
boxplot(Hardness ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Hardness, ylab = 'Hardness')$out
out_val
out_rows <- which(water_potability$Hardness %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Hardness)
Min. 1st Qu. Median Mean 3rd Qu. Max.
121.0 177.7 197.3 196.2 215.5 272.1
#-------------------------------------------
-Solids
summary(water_potability$Solids)
Min. 1st Qu. Median Mean 3rd Qu. Max.
320.9 15704.5 20855.3 21840.2 27045.9 56488.7
quartiles <- quantile(water_potability$Solids, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
15704.48 27045.93
iqr <- IQR(water_potability$Solids)
iqr
[1] 11341.45
lower <- quartiles[1] - 1.5*iqr
lower
25%
-1307.69
upper <- quartiles[2] + 1.5*iqr
upper
75%
44058.1
boxplot(Solids ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Solids, ylab = 'Solids')$out
out_val
out_rows <- which(water_potability$Solids %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Solids)
Min. 1st Qu. Median Mean 3rd Qu. Max.
320.9 15547.5 20518.7 21419.6 26734.7 43195.5
#-------------------------------------------
-Chloramines
summary(water_potability$Chloramines)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.391 6.141 7.135 7.135 8.094 13.127
quartiles <- quantile(water_potability$Chloramines, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
6.141236 8.094323
iqr <- IQR(water_potability$Chloramines)
iqr
[1] 1.953087
lower <- quartiles[1] - 1.5*iqr
lower
25%
3.211605
upper <- quartiles[2] + 1.5*iqr
upper
75%
11.02395
boxplot(Chloramines ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Chloramines, ylab = 'Chloramines')$out
out_val
out_rows <- which(water_potability$Chloramines %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Chloramines)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.352 6.181 7.137 7.136 8.076 10.897
#-------------------------------------------
-Sulfate
summary(water_potability$Sulfate)
Min. 1st Qu. Median Mean 3rd Qu. Max.
187.2 308.2 332.6 333.4 358.3 481.0
quartiles <- quantile(water_potability$Sulfate, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
308.1884 358.3020
iqr <- IQR(water_potability$Sulfate)
iqr
[1] 50.11358
lower <- quartiles[1] - 1.5*iqr
lower
25%
233.0181
upper <- quartiles[2] + 1.5*iqr
upper
75%
433.4724
boxplot(Sulfate ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Sulfate, ylab = 'Sulfate')$out
out_val
out_rows <- which(water_potability$Sulfate %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Sulfate)
Min. 1st Qu. Median Mean 3rd Qu. Max.
237.5 309.2 332.8 333.6 357.7 429.8
#-------------------------------------------
-Conductivity
summary(water_potability$Conductivity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
201.6 366.6 423.6 426.8 482.6 753.3
quartiles <- quantile(water_potability$Conductivity, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
366.5581 482.5983
iqr <- IQR(water_potability$Conductivity)
iqr
[1] 116.0401
lower <- quartiles[1] - 1.5*iqr
lower
25%
192.4979
upper <- quartiles[2] + 1.5*iqr
upper
75%
656.6585
boxplot(Conductivity ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Conductivity, ylab = 'Conductivity')$out
out_val
out_rows <- which(water_potability$Conductivity %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Conductivity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
201.6 366.4 423.1 426.0 481.9 652.5
#-------------------------------------------
-Organic_carbon
summary(water_potability$Organic_carbon)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.372 12.184 14.351 14.417 16.788 27.007
quartiles <- quantile(water_potability$Organic_carbon, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
12.18447 16.78779
iqr <- IQR(water_potability$Organic_carbon)
iqr
[1] 4.603315
lower <- quartiles[1] - 1.5*iqr
lower
25%
5.279502
upper <- quartiles[2] + 1.5*iqr
upper
75%
23.69276
boxplot(Organic_carbon ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Organic_carbon, ylab = 'Organic_carbon')$out
out_val
out_rows <- which(water_potability$Organic_carbon %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Organic_carbon)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.512 12.222 14.352 14.426 16.786 23.604
#-------------------------------------------
-Trihalomethanes
summary(water_potability$Trihalomethanes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.577 55.865 66.231 66.364 77.418 124.000
quartiles <- quantile(water_potability$Trihalomethanes, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
55.86494 77.41789
iqr <- IQR(water_potability$Trihalomethanes)
iqr
[1] 21.55295
lower <- quartiles[1] - 1.5*iqr
lower
25%
23.53552
upper <- quartiles[2] + 1.5*iqr
upper
75%
109.7473
boxplot(Trihalomethanes ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Trihalomethanes, ylab = 'Trihalomethanes')$out
out_val
out_rows <- which(water_potability$Trihalomethanes %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Trihalomethanes)
Min. 1st Qu. Median Mean 3rd Qu. Max.
24.53 55.96 66.29 66.42 77.34 108.85
#-------------------------------------------
-Turbidity
summary(water_potability$Turbidity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.450 3.441 3.975 3.973 4.519 6.495
quartiles <- quantile(water_potability$Turbidity, probs = c(.25, .75), na.rm = FALSE)
quartiles
25% 75%
3.440859 4.518751
iqr <- IQR(water_potability$Turbidity)
iqr
[1] 1.077892
lower <- quartiles[1] - 1.5*iqr
lower
25%
1.824021
upper <- quartiles[2] + 1.5*iqr
upper
75%
6.135588
boxplot(Turbidity ~ Potability, data = water_potability)
repeat {
out_val <- boxplot(water_potability$Turbidity, ylab = 'Turbidity')$out
out_val
out_rows <- which(water_potability$Turbidity %in% c(out_val))
out_rows
if(sum(out_rows) > 0) water_potability <- water_potability[-out_rows,]
else {break}
}
summary(water_potability$Turbidity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.873 3.443 3.974 3.972 4.512 6.084
After removing outliers:
dim(water_potability)
[1] 1750 10
summary(water_potability)
ph Hardness Solids Chloramines Sulfate Conductivity
Min. : 3.388 Min. :121.0 Min. : 320.9 Min. : 3.352 Min. :237.5 Min. :201.6
1st Qu.: 6.125 1st Qu.:177.8 1st Qu.:15465.4 1st Qu.: 6.189 1st Qu.:309.7 1st Qu.:366.4
Median : 7.026 Median :197.3 Median :20468.8 Median : 7.135 Median :333.0 Median :422.4
Mean : 7.075 Mean :196.0 Mean :21362.1 Mean : 7.131 Mean :333.9 Mean :426.0
3rd Qu.: 7.986 3rd Qu.:215.0 3rd Qu.:26588.0 3rd Qu.: 8.062 3rd Qu.:357.8 3rd Qu.:482.2
Max. :10.905 Max. :272.1 Max. :43195.5 Max. :10.897 Max. :429.8 Max. :652.5
Organic_carbon Trihalomethanes Turbidity Potability
Min. : 5.512 Min. : 24.53 Min. :1.873 Not Potable:1066
1st Qu.:12.233 1st Qu.: 55.96 1st Qu.:3.443 Potable : 684
Median :14.353 Median : 66.33 Median :3.974
Mean :14.434 Mean : 66.42 Mean :3.972
3rd Qu.:16.797 3rd Qu.: 77.34 3rd Qu.:4.512
Max. :23.604 Max. :108.85 Max. :6.084
str(water_potability)
'data.frame': 1750 obs. of 10 variables:
$ ph : num 8.32 9.09 5.58 10.22 8.64 ...
$ Hardness : num 214 181 188 248 203 ...
$ Solids : num 22018 17979 28749 28750 13672 ...
$ Chloramines : num 8.06 6.55 7.54 7.51 4.56 ...
$ Sulfate : num 357 310 327 394 303 ...
$ Conductivity : num 363 398 280 284 475 ...
$ Organic_carbon : num 18.4 11.6 8.4 13.8 12.4 ...
$ Trihalomethanes: num 100.3 32 54.9 84.6 62.8 ...
$ Turbidity : num 4.63 4.08 2.56 2.67 4.4 ...
$ Potability : Factor w/ 2 levels "Not Potable",..: 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "na.action")= 'omit' Named int [1:1265] 1 2 3 9 12 14 15 17 19 21 ...
..- attr(*, "names")= chr [1:1265] "1" "2" "3" "9" ...
head(water_potability)
Description: Removing outliers from a dataset is critical for assuring the quality and reliability of statistical analysis and machine learning models. To produce a more accurate dataset that would help obtain more precise results later, we took the following steps to handle outliers in the numeric attributes. Firstly, we identified all the outliers. Secondly, we deleted the rows containing the outliers. Finally, we conducted a second check to ensure all outliers were deleted. Any new outliers due to the IQR change after deleting rows in the second step were also eliminated.
Charts
Histogram
hist(water_potability$ph)
hist(water_potability$Chloramines)
hist(water_potability$Hardness)
hist(water_potability$Solids)
hist(water_potability$Sulfate)
hist(water_potability$Conductivity)
hist(water_potability$Organic_carbon)
hist(water_potability$Trihalomethanes)
hist(water_potability$Turbidity)
Bar Plot
tab <- water_potability$Potability %>% table()
txt <- paste0(tab)
bb <- water_potability$ph %>% table() %>% barplot( main='ph',col=c('pink'))
bb <- water_potability$Potability %>% table() %>% barplot( main='Potability',ylab='Frequency',col=c('pink', 'lightblue'))
text(bb, tab/2, labels=txt, cex=1)
Pie chart
water_potability$Potability %>% table() %>% pie()
Scatter Plot
with(water_potability, plot(Turbidity, ph, col = Potability, pch = as.numeric(Potability)))
Description: -Histogram: The histogram shows the frequency of ph in the dataset; we noted that the majority of values fall within the usual range, which is about between 6 and 8, but it also shows several outliers. -Scatter plot: This scatter demonstrates the correlation and proportionality between the two qualities, allowing us to establish whether or not turbidity and pH are connected. -Bar Plot the bar plot represent how ph levels affect water portability in the dataset it indicates that ph level above 10 is not portibal and humans cant consume it
Remove Redundant Features:
correlation_matrix <- cor(water_potability[,1:9])
high_correlation_features <- findCorrelation(correlation_matrix, cutoff = 0.5)
print(high_correlation_features)
integer(0)
heatmap(correlation_matrix)
Description: This will find the correlation between the features and represent it in heat map
Feature selection
Rank Features By Importance:
#train random forest model and calculate feature importance
rf = randomForest(x= water_potability[,1:9],y= water_potability[,10])
var_imp <- varImp(rf, scale = FALSE)
#sort the score in decreasing order
var_imp_df <- data.frame(cbind(variable = rownames(var_imp), score = var_imp[,1]))
var_imp_df$score <- as.double(var_imp_df$score)
var_imp_df[order(var_imp_df$score,decreasing = TRUE),]
ggplot(var_imp_df, aes(x=reorder(variable, score), y=score)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=score)) +
ylab("IncNodePurity") +
xlab("Variable Name") +
coord_flip()
Recursive Feature elimination:
control <- rfeControl(functions=rfFuncs, method="cv",number=10)
rf <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
# run the RFE algorithm
rfe_model <- rfe(x= water_potability[,1:9],y= water_potability[,10], sizes=c(1:9), rfeControl=control)
# summarize the results
print(rfe_model)
Recursive feature selection
Outer resampling method: Cross-Validated (10 fold)
Resampling performance over subset size:
The top 5 variables (out of 6):
Sulfate, ph, Hardness, Solids, Chloramines
# list the chosen features
predictors(rfe_model)
[1] "Sulfate" "ph" "Hardness" "Solids" "Chloramines" "Trihalomethanes"
# plot the results
plot(rfe_model, type=c("g", "o"))
Description: Ranking features by importance is a technique used to identify the most influential variables in a dataset for predicting a target variable. This process helps understand which features impact the model’s performance most by ranking features by importance.
Removing redundant features means eliminating variables or features from a dataset that do not provide additional or unique information.
Data transformation
Discretization:
wp<- water_potability
wp$ph= cut(wp$ph, breaks = seq(3,11,by=4),right=FALSE)
wp$Hardness= cut(wp$Hardness, breaks = seq(120,280,by=40),right=FALSE)
wp$Chloramines = cut(wp$Chloramines, breaks = seq(3,11,by=4),right = FALSE)
wp$Sulfate= cut(wp$Sulfate, breaks = seq(220,440,by=44),right=FALSE)
wp$Conductivity= cut(wp$Conductivity, breaks = seq(200,700,by=100),right=FALSE)
wp$Organic_carbon= cut(wp$Organic_carbon, breaks = seq(4,24,by=4),right=FALSE)
wp$Trihalomethanes= cut(wp$Trihalomethanes, breaks = seq(20,110,by=10),right=FALSE)
wp$Turbidity= cut(wp$Turbidity, breaks = seq(1,7,by=2),right=FALSE)
print(wp)
Description: Discretization is the process of transforming continuous variables into discrete or categorical variables. It can be useful for analyzing data with many unique values or simplifying it. Therefore, we transformed the continuous values of the numeric attributes into intervals by dividing the values to fall on one of the possible interval labels by discretization. The values will be meaningful and simpler to classify or perform other methods to help us later in our model. So, In Trihalomethanes, we intervals by dividing the values by 10 to have labels with equal width : [20,30) [30,40) [40,50) [50,60) [60,70) [70,80) [80,90) [90,100) [100,110).
Normlization
normalize=function(x){return ((x-min(x))/(max(x)))}
wp$Solids=normalize(wp$Solids)
Description: Normalization refers to the process of scaling variables to have a common range. It helps in comparing variables with different scales. The solids attribute will create critical challenges because of the vast and diverted values: min is 320.9, and max is 43195.5, so we normalized the solids between 0 and 1 to make values smaller and more reasonable.
Encoding
Description: Encoding is converting characters or strings into a specific encoding format. We could not implement it since our database does not have a Nominal attribute.
wp
Information gain (ID3) Splitting the data set into two subsets: Training(70%) and Testing(30%):
set.seed(1958)
ind <- sample(2, nrow(wp), replace = TRUE, prob = c(0.7, 0.3))
train.data <- wp[ind == 1, ]
test.data <- wp[ind == 2, ]
train.data$Potability <- as.factor(train.data$Potability)
test.data$Potability <- as.factor(test.data$Potability)
myFormula <- Potability ~ ph+Hardness+Solids+Chloramines+Sulfate+Conductivity+Organic_carbon+Trihalomethanes+Turbidity
#myFormula <- Potability ~ ph+Hardness+Solids+Chloramines+Sulfate
m.ctree <- ctree(myFormula, data = train.data)
table(predict(m.ctree), train.data$Potability)
Not Potable Potable
Not Potable 720 412
Potable 35 56
print(m.ctree)
Model formula:
Potability ~ ph + Hardness + Solids + Chloramines + Sulfate +
Conductivity + Organic_carbon + Trihalomethanes + Turbidity
Fitted party:
[1] root
| [2] Sulfate in [220,264), [396,440): Potable (n = 91, err = 38.5%)
| [3] Sulfate in [264,308), [308,352), [352,396): Not Potable (n = 1132, err = 36.4%)
Number of inner nodes: 1
Number of terminal nodes: 2
plot(m.ctree, type="simple")
testPred <- predict(m.ctree, newdata = test.data)
result<-table(testPred, test.data$Potability)
co_result <- confusionMatrix(result)
print(co_result)
Confusion Matrix and Statistics
testPred Not Potable Potable
Not Potable 293 200
Potable 18 16
Accuracy : 0.5863
95% CI : (0.543, 0.6287)
No Information Rate : 0.5901
P-Value [Acc > NIR] : 0.5886
Kappa : 0.0186
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.94212
Specificity : 0.07407
Pos Pred Value : 0.59432
Neg Pred Value : 0.47059
Prevalence : 0.59013
Detection Rate : 0.55598
Detection Prevalence : 0.93548
Balanced Accuracy : 0.50810
'Positive' Class : Not Potable
as.matrix(co_result, what = "classes")
[,1]
Sensitivity 0.94212219
Specificity 0.07407407
Pos Pred Value 0.59432049
Neg Pred Value 0.47058824
Precision 0.59432049
Recall 0.94212219
F1 0.72885572
Prevalence 0.59013283
Detection Rate 0.55597723
Detection Prevalence 0.93548387
Balanced Accuracy 0.50809813
acc <- co_result$overall["Accuracy"]
acc*100
Accuracy
58.63378
pred_probs <- as.numeric(predict(m.ctree, newdata = test.data, type = "response"))
binary_outcome <- as.numeric(test.data$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5080981
Splitting the data set into two subsets: Training(80%) and Testing(20%):
set.seed(1958)
ind <- sample(2, nrow(wp), replace = TRUE, prob = c(0.8, 0.2))
train.data <- wp[ind == 1, ]
test.data <- wp[ind == 2, ]
train.data$Potability <- as.factor(train.data$Potability)
myFormula <- Potability ~ ph+Hardness+Solids+Chloramines+Sulfate+Conductivity+Organic_carbon+Trihalomethanes+Turbidity
#myFormula <- Potability ~ ph+Hardness+Solids+Chloramines+Sulfate
m.ctree <- ctree(myFormula, data = train.data)
table(predict(m.ctree), train.data$Potability)
Not Potable Potable
Not Potable 854 472
Potable 12 55
print(m.ctree)
Model formula:
Potability ~ ph + Hardness + Solids + Chloramines + Sulfate +
Conductivity + Organic_carbon + Trihalomethanes + Turbidity
Fitted party:
[1] root
| [2] Sulfate in [220,264), [264,308), [352,396), [396,440)
| | [3] Solids <= 0.64014: Not Potable (n = 598, err = 40.1%)
| | [4] Solids > 0.64014
| | | [5] Sulfate in [220,264), [264,308)
| | | | [6] ph in [3,7)
| | | | | [7] Chloramines in [3,7): Potable (n = 17, err = 23.5%)
| | | | | [8] Chloramines in [7,11): Not Potable (n = 15, err = 20.0%)
| | | | [9] ph in [7,11): Potable (n = 50, err = 16.0%)
| | | [10] Sulfate in [352,396), [396,440): Not Potable (n = 62, err = 38.7%)
| [11] Sulfate in [308,352): Not Potable (n = 651, err = 31.5%)
Number of inner nodes: 5
Number of terminal nodes: 6
plot(m.ctree, type="simple")
testPred <- predict(m.ctree, newdata = test.data)
result<-table(testPred, test.data$Potability)
co_result <- confusionMatrix(result)
print(co_result)
Confusion Matrix and Statistics
testPred Not Potable Potable
Not Potable 192 144
Potable 8 13
Accuracy : 0.5742
95% CI : (0.5211, 0.6261)
No Information Rate : 0.5602
P-Value [Acc > NIR] : 0.3163
Kappa : 0.0472
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9600
Specificity : 0.0828
Pos Pred Value : 0.5714
Neg Pred Value : 0.6190
Prevalence : 0.5602
Detection Rate : 0.5378
Detection Prevalence : 0.9412
Balanced Accuracy : 0.5214
'Positive' Class : Not Potable
as.matrix(co_result, what = "classes")
[,1]
Sensitivity 0.96000000
Specificity 0.08280255
Pos Pred Value 0.57142857
Neg Pred Value 0.61904762
Precision 0.57142857
Recall 0.96000000
F1 0.71641791
Prevalence 0.56022409
Detection Rate 0.53781513
Detection Prevalence 0.94117647
Balanced Accuracy 0.52140127
acc <- co_result$overall["Accuracy"]
acc*100
Accuracy
57.42297
pred_probs <- as.numeric(predict(m.ctree, newdata = test.data, type = "response"))
binary_outcome <- as.numeric(test.data$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5214013
Splitting the data set into two subsets: Training(90%) and Testing(10%):
set.seed(1958)
ind <- sample(2, nrow(wp), replace = TRUE, prob = c(0.9, 0.1))
train.data <- wp[ind == 1, ]
test.data <- wp[ind == 2, ]
train.data$Potability <- as.factor(train.data$Potability)
myFormula <- Potability ~ ph+Hardness+Solids+Chloramines+Sulfate+Conductivity+Organic_carbon+Trihalomethanes+Turbidity
#myFormula <- Potability ~ ph+Hardness+Solids+Chloramines+Sulfate
m.ctree <- ctree(myFormula, data = train.data)
table(predict(m.ctree), train.data$Potability)
Not Potable Potable
Not Potable 904 501
Potable 58 110
print(m.ctree)
Model formula:
Potability ~ ph + Hardness + Solids + Chloramines + Sulfate +
Conductivity + Organic_carbon + Trihalomethanes + Turbidity
Fitted party:
[1] root
| [2] Sulfate in [220,264), [264,308), [352,396), [396,440)
| | [3] Solids <= 0.66526
| | | [4] Sulfate in [220,264), [396,440): Potable (n = 86, err = 43.0%)
| | | [5] Sulfate in [264,308), [352,396): Not Potable (n = 618, err = 39.6%)
| | [6] Solids > 0.66526
| | | [7] Sulfate in [220,264), [264,308): Potable (n = 82, err = 25.6%)
| | | [8] Sulfate in [352,396), [396,440): Not Potable (n = 54, err = 37.0%)
| [9] Sulfate in [308,352): Not Potable (n = 733, err = 32.2%)
Number of inner nodes: 4
Number of terminal nodes: 5
plot(m.ctree, type="simple")
testPred <- predict(m.ctree, newdata = test.data)
result<-table(testPred, test.data$Potability)
co_result <- confusionMatrix(result)
print(co_result)
Confusion Matrix and Statistics
testPred Not Potable Potable
Not Potable 91 64
Potable 13 9
Accuracy : 0.565
95% CI : (0.4885, 0.6392)
No Information Rate : 0.5876
P-Value [Acc > NIR] : 0.7547
Kappa : -0.0019
Mcnemar's Test P-Value : 1.212e-08
Sensitivity : 0.8750
Specificity : 0.1233
Pos Pred Value : 0.5871
Neg Pred Value : 0.4091
Prevalence : 0.5876
Detection Rate : 0.5141
Detection Prevalence : 0.8757
Balanced Accuracy : 0.4991
'Positive' Class : Not Potable
as.matrix(co_result, what = "classes")
[,1]
Sensitivity 0.8750000
Specificity 0.1232877
Pos Pred Value 0.5870968
Neg Pred Value 0.4090909
Precision 0.5870968
Recall 0.8750000
F1 0.7027027
Prevalence 0.5875706
Detection Rate 0.5141243
Detection Prevalence 0.8757062
Balanced Accuracy 0.4991438
acc <- co_result$overall["Accuracy"]
acc*100
Accuracy
56.49718
pred_probs <- as.numeric(predict(m.ctree, newdata = test.data, type = "response"))
binary_outcome <- as.numeric(test.data$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.4991438
Gain ratio (C4.5)
# 3 folds
set.seed(1958)
train <- createFolds(wp$Potability, k=3)
C45Fit <- train(Potability ~ .,method = "J48",data = wp,
trControl = trainControl(
method = "cv",
index = train,
savePredictions = TRUE))
C45Fit
C4.5-like Trees
1750 samples
9 predictor
2 classes: 'Not Potable', 'Potable'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 584, 583, 583
Resampling results across tuning parameters:
C M Accuracy Kappa
0.010 1 0.6140010 0.02242533
0.010 2 0.6080020 0.05080768
0.010 3 0.6148589 0.02353259
0.255 1 0.5865761 0.09696359
0.255 2 0.5800048 0.08645875
0.255 3 0.5842866 0.08011865
0.500 1 0.5548589 0.06529168
0.500 2 0.5622907 0.07454861
0.500 3 0.5668601 0.07437164
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were C = 0.01 and M = 3.
C45Fit$finalModel
J48 pruned tree
------------------
Sulfate[396,440) <= 0: Not Potable (1670.0/640.0)
Sulfate[396,440) > 0
| ph[7,11) <= 0: Potable (42.0/10.0)
| ph[7,11) > 0: Not Potable (38.0/12.0)
Number of Leaves : 3
Size of the tree : 5
pred_probs <- predict(C45Fit, newdata = wp, type = "prob")[, "Potable"]
binary_outcome <- as.numeric(wp$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5216363
# 5 folds
set.seed(1958)
train <- createFolds(wp$Potability, k=5)
C45Fit <- train(Potability ~., method="J48", data=wp,
trControl = trainControl(
method ="cv",
index = train,
savePredictions = TRUE))
C45Fit
C4.5-like Trees
1750 samples
9 predictor
2 classes: 'Not Potable', 'Potable'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 350, 350, 350, 350, 350
Resampling results across tuning parameters:
C M Accuracy Kappa
0.010 1 0.6067143 0.003029794
0.010 2 0.6062857 0.002195738
0.010 3 0.6060000 0.001633667
0.255 1 0.5515714 0.030018055
0.255 2 0.5642857 0.053615393
0.255 3 0.5667143 0.043028830
0.500 1 0.5450000 0.036774387
0.500 2 0.5545714 0.051996482
0.500 3 0.5527143 0.030952302
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were C = 0.01 and M = 1.
C45Fit$finalModel
J48 pruned tree
------------------
Sulfate[396,440) <= 0: Not Potable (1670.0/640.0)
Sulfate[396,440) > 0
| ph[7,11) <= 0: Potable (42.0/10.0)
| ph[7,11) > 0: Not Potable (38.0/12.0)
Number of Leaves : 3
Size of the tree : 5
pred_probs <- predict(C45Fit, newdata = wp, type = "prob")[, "Potable"]
binary_outcome <- as.numeric(wp$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5216363
# 10 folds
set.seed(1958)
train <- createFolds(wp$Potability, k=10)
C45Fit <- train(Potability ~., method="J48", data=wp,
trControl = trainControl(
method="cv", indexOut=train))
C45Fit
C4.5-like Trees
1750 samples
9 predictor
2 classes: 'Not Potable', 'Potable'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1576, 1574, 1575, 1574, 1574, 1575, ...
Resampling results across tuning parameters:
C M Accuracy Kappa
0.010 1 0.6331355 0.08610946
0.010 2 0.6371355 0.10006932
0.010 3 0.6359927 0.09906518
0.255 1 0.8160390 0.59246265
0.255 2 0.7834603 0.51920580
0.255 3 0.7611579 0.47221105
0.500 1 0.8908616 0.76673013
0.500 2 0.8320196 0.63983987
0.500 3 0.8006036 0.57209792
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were C = 0.5 and M = 1.
C45Fit$finalModel
J48 pruned tree
------------------
Sulfate[396,440) <= 0
| Sulfate[308,352) <= 0
| | Trihalomethanes[40,50) <= 0
| | | Organic_carbon[20,24) <= 0
| | | | ph[7,11) <= 0
| | | | | Sulfate[352,396) <= 0
| | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | Turbidity[3,5) <= 0
| | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | Trihalomethanes[90,100) <= 0
| | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | Solids <= 0.397137: Not Potable (2.0)
| | | | | | | | | | | | Solids > 0.397137: Potable (5.0)
| | | | | | | | | | | Organic_carbon[16,20) > 0: Not Potable (2.0)
| | | | | | | | | | Chloramines[7,11) > 0: Not Potable (14.0/1.0)
| | | | | | | | | Trihalomethanes[90,100) > 0
| | | | | | | | | | Chloramines[7,11) <= 0: Not Potable (1.0)
| | | | | | | | | | Chloramines[7,11) > 0: Potable (3.0)
| | | | | | | | Trihalomethanes[30,40) > 0: Potable (2.0)
| | | | | | | Turbidity[3,5) > 0
| | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | Hardness[240,280) <= 0
| | | | | | | | | | Sulfate[264,308) <= 0
| | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | | | | | Organic_carbon[12,16) <= 0: Potable (3.0)
| | | | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | | | Trihalomethanes[50,60) <= 0: Not Potable (2.0)
| | | | | | | | | | | | | | Trihalomethanes[50,60) > 0: Potable (1.0)
| | | | | | | | | | | | Trihalomethanes[30,40) > 0: Not Potable (1.0)
| | | | | | | | | | | Hardness[160,200) > 0: Not Potable (2.0)
| | | | | | | | | | Sulfate[264,308) > 0
| | | | | | | | | | | Hardness[200,240) <= 0
| | | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Not Potable (1.0)
| | | | | | | | | | | | | | | | Conductivity[300,400) > 0: Potable (2.0)
| | | | | | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | | | | | Trihalomethanes[100,110) <= 0
| | | | | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Not Potable (2.0)
| | | | | | | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Potable (4.0/1.0)
| | | | | | | | | | | | | | | | | | | | Conductivity[300,400) > 0: Not Potable (1.0)
| | | | | | | | | | | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Not Potable (2.0)
| | | | | | | | | | | | | | | | | | | | Conductivity[300,400) > 0: Potable (1.0)
| | | | | | | | | | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | | | | | | | | | Organic_carbon[8,12) <= 0: Not Potable (6.0)
| | | | | | | | | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (1.0)
| | | | | | | | | | | | | | | | Trihalomethanes[100,110) > 0: Potable (1.0)
| | | | | | | | | | | | | | Conductivity[500,600) > 0: Not Potable (6.0)
| | | | | | | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | | | | | | Chloramines[7,11) <= 0: Potable (3.0)
| | | | | | | | | | | | | | Chloramines[7,11) > 0: Not Potable (3.0/1.0)
| | | | | | | | | | | | Trihalomethanes[50,60) > 0: Not Potable (11.0/1.0)
| | | | | | | | | | | Hardness[200,240) > 0
| | | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | | Chloramines[7,11) <= 0: Not Potable (5.0)
| | | | | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Not Potable (5.0)
| | | | | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | | | | Organic_carbon[8,12) <= 0: Potable (2.0)
| | | | | | | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (1.0)
| | | | | | | | | | | | | | Organic_carbon[12,16) > 0: Potable (2.0)
| | | | | | | | | | | | Trihalomethanes[60,70) > 0: Not Potable (13.0)
| | | | | | | | | Hardness[240,280) > 0
| | | | | | | | | | Conductivity[500,600) <= 0: Not Potable (6.0)
| | | | | | | | | | Conductivity[500,600) > 0: Potable (1.0)
| | | | | | | | Trihalomethanes[80,90) > 0
| | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | Solids <= 0.807853: Not Potable (19.0)
| | | | | | | | | | Solids > 0.807853: Potable (1.0)
| | | | | | | | | Conductivity[500,600) > 0: Potable (2.0)
| | | | | | Trihalomethanes[70,80) > 0
| | | | | | | Turbidity[3,5) <= 0: Not Potable (7.0)
| | | | | | | Turbidity[3,5) > 0
| | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | Hardness[240,280) <= 0
| | | | | | | | | | Sulfate[264,308) <= 0: Potable (3.0)
| | | | | | | | | | Sulfate[264,308) > 0
| | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | Solids <= 0.394234: Potable (1.0)
| | | | | | | | | | | | | | Solids > 0.394234: Not Potable (6.0)
| | | | | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | | | | Hardness[200,240) <= 0: Not Potable (1.0)
| | | | | | | | | | | | | | Hardness[200,240) > 0
| | | | | | | | | | | | | | | Solids <= 0.195778: Not Potable (1.0)
| | | | | | | | | | | | | | | Solids > 0.195778: Potable (3.0)
| | | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | | Solids <= 0.33921: Not Potable (1.0)
| | | | | | | | | | | | | | | Solids > 0.33921: Potable (3.0)
| | | | | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (2.0)
| | | | | | | | | | | | | Conductivity[300,400) > 0: Potable (2.0)
| | | | | | | | | | | Conductivity[500,600) > 0: Potable (4.0/1.0)
| | | | | | | | | Hardness[240,280) > 0: Not Potable (3.0)
| | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | Conductivity[500,600) <= 0: Potable (6.0)
| | | | | | | | | Conductivity[500,600) > 0: Not Potable (1.0)
| | | | | Sulfate[352,396) > 0
| | | | | | Hardness[240,280) <= 0
| | | | | | | Chloramines[7,11) <= 0
| | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | Turbidity[5,7) <= 0
| | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | Turbidity[3,5) <= 0: Not Potable (1.0)
| | | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Potable (1.0)
| | | | | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | | | | Solids <= 0.377252: Not Potable (2.0)
| | | | | | | | | | | | | | | | Solids > 0.377252: Potable (3.0)
| | | | | | | | | | | | | | Trihalomethanes[50,60) > 0: Not Potable (2.0)
| | | | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Potable (4.0)
| | | | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | | | Solids <= 0.421001: Not Potable (2.0)
| | | | | | | | | | | | | | | Solids > 0.421001: Potable (5.0)
| | | | | | | | | | | Conductivity[500,600) > 0
| | | | | | | | | | | | Organic_carbon[16,20) <= 0: Not Potable (4.0)
| | | | | | | | | | | | Organic_carbon[16,20) > 0: Potable (2.0)
| | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | | | Turbidity[3,5) <= 0: Not Potable (1.0)
| | | | | | | | | | | | Turbidity[3,5) > 0: Potable (17.0/3.0)
| | | | | | | | | | | Trihalomethanes[80,90) > 0: Not Potable (1.0)
| | | | | | | | | Turbidity[5,7) > 0
| | | | | | | | | | Organic_carbon[12,16) <= 0: Potable (5.0)
| | | | | | | | | | Organic_carbon[12,16) > 0: Not Potable (1.0)
| | | | | | | | Trihalomethanes[70,80) > 0
| | | | | | | | | Conductivity[600,700) <= 0
| | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | Organic_carbon[8,12) <= 0: Potable (3.0)
| | | | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (1.0)
| | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | Solids <= 0.673943: Not Potable (3.0)
| | | | | | | | | | | | | Solids > 0.673943: Potable (2.0)
| | | | | | | | | | | Organic_carbon[16,20) > 0: Potable (1.0)
| | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | Turbidity[3,5) <= 0: Not Potable (1.0)
| | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | Solids <= 0.201591: Potable (1.0)
| | | | | | | | | | | | | Solids > 0.201591: Not Potable (5.0)
| | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (1.0)
| | | | | | | | | Conductivity[600,700) > 0: Potable (1.0)
| | | | | | | Chloramines[7,11) > 0
| | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | Conductivity[400,500) <= 0: Potable (4.0)
| | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Potable (4.0/1.0)
| | | | | | | | | | | | | | Trihalomethanes[60,70) > 0: Not Potable (1.0)
| | | | | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | | | | Solids <= 0.482566: Potable (6.0)
| | | | | | | | | | | | | | Solids > 0.482566: Not Potable (1.0)
| | | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | | | Trihalomethanes[90,100) <= 0
| | | | | | | | | | | | | | | Solids <= 0.561575: Potable (8.0/2.0)
| | | | | | | | | | | | | | | Solids > 0.561575: Not Potable (2.0)
| | | | | | | | | | | | | | Trihalomethanes[90,100) > 0: Not Potable (1.0)
| | | | | | | | | | | | | Trihalomethanes[50,60) > 0: Not Potable (2.0)
| | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | | | Turbidity[3,5) <= 0: Not Potable (1.0)
| | | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | | | Hardness[200,240) <= 0: Not Potable (6.0)
| | | | | | | | | | | | | | Hardness[200,240) > 0: Potable (6.0/2.0)
| | | | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Potable (3.0)
| | | | | | | | | | | | | | Trihalomethanes[60,70) > 0: Not Potable (1.0)
| | | | | | | | | | | Trihalomethanes[80,90) > 0: Not Potable (1.0)
| | | | | | | | | Conductivity[500,600) > 0
| | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | Organic_carbon[12,16) <= 0: Not Potable (9.0/1.0)
| | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | | Turbidity[3,5) <= 0
| | | | | | | | | | | | | | Turbidity[5,7) <= 0: Not Potable (2.0)
| | | | | | | | | | | | | | Turbidity[5,7) > 0
| | | | | | | | | | | | | | | Solids <= 0.40018: Not Potable (1.0)
| | | | | | | | | | | | | | | Solids > 0.40018: Potable (3.0)
| | | | | | | | | | | | | Turbidity[3,5) > 0: Potable (4.0)
| | | | | | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (1.0)
| | | | | | | | | | Organic_carbon[16,20) > 0: Potable (1.0)
| | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | Trihalomethanes[100,110) <= 0
| | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | Turbidity[5,7) <= 0
| | | | | | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | | | | | Hardness[200,240) <= 0: Not Potable (14.0/1.0)
| | | | | | | | | | | | | Hardness[200,240) > 0
| | | | | | | | | | | | | | Trihalomethanes[90,100) <= 0: Potable (4.0/1.0)
| | | | | | | | | | | | | | Trihalomethanes[90,100) > 0: Not Potable (1.0)
| | | | | | | | | | | | Trihalomethanes[30,40) > 0: Potable (1.0)
| | | | | | | | | | | Turbidity[5,7) > 0: Not Potable (1.0)
| | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (4.0)
| | | | | | | | | Trihalomethanes[100,110) > 0: Potable (1.0)
| | | | | | Hardness[240,280) > 0
| | | | | | | Conductivity[500,600) <= 0: Potable (8.0/1.0)
| | | | | | | Conductivity[500,600) > 0: Not Potable (1.0)
| | | | ph[7,11) > 0
| | | | | Sulfate[352,396) <= 0
| | | | | | Sulfate[264,308) <= 0
| | | | | | | Solids <= 0.233755: Not Potable (2.0)
| | | | | | | Solids > 0.233755
| | | | | | | | Hardness[240,280) <= 0: Potable (18.0/1.0)
| | | | | | | | Hardness[240,280) > 0: Not Potable (1.0)
| | | | | | Sulfate[264,308) > 0
| | | | | | | Hardness[200,240) <= 0
| | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | Solids <= 0.509695: Not Potable (20.0/4.0)
| | | | | | | | | | Solids > 0.509695
| | | | | | | | | | | Hardness[240,280) <= 0: Potable (17.0/2.0)
| | | | | | | | | | | Hardness[240,280) > 0
| | | | | | | | | | | | Organic_carbon[12,16) <= 0: Potable (2.0)
| | | | | | | | | | | | Organic_carbon[12,16) > 0: Not Potable (3.0)
| | | | | | | | | Trihalomethanes[50,60) > 0: Potable (4.0)
| | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | Trihalomethanes[90,100) <= 0
| | | | | | | | | | | | Solids <= 0.741689
| | | | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | | | | Turbidity[5,7) <= 0
| | | | | | | | | | | | | | | | Organic_carbon[8,12) <= 0: Potable (2.0)
| | | | | | | | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | | | | | | | Hardness[160,200) <= 0: Potable (1.0)
| | | | | | | | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | | | | | | | Solids <= 0.445778: Potable (1.0)
| | | | | | | | | | | | | | | | | | Solids > 0.445778: Not Potable (2.0)
| | | | | | | | | | | | | | | Turbidity[5,7) > 0: Not Potable (1.0)
| | | | | | | | | | | | | | Trihalomethanes[60,70) > 0: Not Potable (2.0)
| | | | | | | | | | | | | Conductivity[500,600) > 0: Not Potable (3.0)
| | | | | | | | | | | | Solids > 0.741689: Potable (6.0)
| | | | | | | | | | | Trihalomethanes[90,100) > 0: Potable (1.0)
| | | | | | | | | | Conductivity[300,400) > 0: Potable (4.0)
| | | | | | | | | Organic_carbon[12,16) > 0: Potable (21.0/2.0)
| | | | | | | Hardness[200,240) > 0
| | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | | | Organic_carbon[8,12) <= 0: Not Potable (6.0/1.0)
| | | | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (5.0)
| | | | | | | | | | | | | Trihalomethanes[60,70) > 0: Not Potable (7.0)
| | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | Trihalomethanes[50,60) <= 0: Potable (8.0/1.0)
| | | | | | | | | | | | | Trihalomethanes[50,60) > 0: Not Potable (2.0)
| | | | | | | | | | | Trihalomethanes[70,80) > 0
| | | | | | | | | | | | Solids <= 0.819072: Not Potable (14.0)
| | | | | | | | | | | | Solids > 0.819072: Potable (1.0)
| | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | Trihalomethanes[90,100) <= 0
| | | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | | Solids <= 0.520975: Not Potable (4.0/1.0)
| | | | | | | | | | | | | | Solids > 0.520975: Potable (4.0)
| | | | | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | | | | Solids <= 0.807853: Potable (6.0/1.0)
| | | | | | | | | | | | | | Solids > 0.807853: Not Potable (1.0)
| | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | | Turbidity[3,5) <= 0: Not Potable (3.0)
| | | | | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | | | | Solids <= 0.315273: Not Potable (2.0)
| | | | | | | | | | | | | | | Solids > 0.315273: Potable (3.0)
| | | | | | | | | | | | | Conductivity[300,400) > 0: Potable (3.0)
| | | | | | | | | | | Trihalomethanes[90,100) > 0: Not Potable (2.0)
| | | | | | | | | Trihalomethanes[80,90) > 0
| | | | | | | | | | Solids <= 0.388146: Not Potable (2.0)
| | | | | | | | | | Solids > 0.388146: Potable (5.0)
| | | | | | | | Trihalomethanes[30,40) > 0
| | | | | | | | | Organic_carbon[12,16) <= 0: Potable (4.0)
| | | | | | | | | Organic_carbon[12,16) > 0: Not Potable (1.0)
| | | | | Sulfate[352,396) > 0
| | | | | | Hardness[200,240) <= 0
| | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | Turbidity[3,5) <= 0: Not Potable (22.0/2.0)
| | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | | Hardness[160,200) <= 0
| | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | Conductivity[400,500) <= 0: Potable (5.0)
| | | | | | | | | | | | | Conductivity[400,500) > 0: Not Potable (4.0/1.0)
| | | | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | | | Hardness[240,280) <= 0
| | | | | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | | | | Solids <= 0.541894: Potable (3.0)
| | | | | | | | | | | | | | | | Solids > 0.541894: Not Potable (1.0)
| | | | | | | | | | | | | | | Conductivity[300,400) > 0: Not Potable (2.0)
| | | | | | | | | | | | | | Hardness[240,280) > 0: Not Potable (5.0)
| | | | | | | | | | | | | Organic_carbon[16,20) > 0: Potable (1.0)
| | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | | Trihalomethanes[90,100) <= 0
| | | | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | | | Conductivity[500,600) <= 0: Not Potable (3.0)
| | | | | | | | | | | | | | | | Conductivity[500,600) > 0
| | | | | | | | | | | | | | | | | Organic_carbon[12,16) <= 0: Potable (1.0)
| | | | | | | | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | | | | | | | Solids <= 0.455149: Not Potable (2.0)
| | | | | | | | | | | | | | | | | | Solids > 0.455149: Potable (2.0)
| | | | | | | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | | | | | | Solids <= 0.272424: Not Potable (1.0)
| | | | | | | | | | | | | | | | Solids > 0.272424: Potable (2.0)
| | | | | | | | | | | | | | Trihalomethanes[90,100) > 0: Not Potable (3.0)
| | | | | | | | | | | | | Conductivity[400,500) > 0: Not Potable (5.0)
| | | | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | | | Trihalomethanes[90,100) <= 0
| | | | | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Not Potable (7.0/2.0)
| | | | | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | | | | | | | | | Solids <= 0.431273: Not Potable (1.0)
| | | | | | | | | | | | | | | | | | | Solids > 0.431273: Potable (1.0)
| | | | | | | | | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | | | | | | | | Solids <= 0.43104: Potable (2.0)
| | | | | | | | | | | | | | | | | | | Solids > 0.43104: Not Potable (1.0)
| | | | | | | | | | | | | | | | | Conductivity[300,400) > 0: Not Potable (2.0)
| | | | | | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (1.0)
| | | | | | | | | | | | | | Conductivity[500,600) > 0: Not Potable (2.0)
| | | | | | | | | | | | | Trihalomethanes[90,100) > 0: Potable (2.0)
| | | | | | | | | | Trihalomethanes[80,90) > 0
| | | | | | | | | | | Hardness[160,200) <= 0: Not Potable (5.0)
| | | | | | | | | | | Hardness[160,200) > 0
| | | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | | Solids <= 0.630566: Not Potable (10.0/1.0)
| | | | | | | | | | | | | Solids > 0.630566: Potable (2.0)
| | | | | | | | | | | | Conductivity[500,600) > 0: Potable (1.0)
| | | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (15.0/3.0)
| | | | | | | Trihalomethanes[30,40) > 0: Not Potable (7.0)
| | | | | | Hardness[200,240) > 0
| | | | | | | Chloramines[7,11) <= 0
| | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | Organic_carbon[8,12) <= 0: Not Potable (22.0/5.0)
| | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | Solids <= 0.586899: Potable (6.0)
| | | | | | | | | | Solids > 0.586899: Not Potable (2.0)
| | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (5.0)
| | | | | | | Chloramines[7,11) > 0
| | | | | | | | Turbidity[3,5) <= 0
| | | | | | | | | Conductivity[500,600) <= 0: Potable (8.0)
| | | | | | | | | Conductivity[500,600) > 0: Not Potable (1.0)
| | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | Trihalomethanes[100,110) <= 0
| | | | | | | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Potable (1.0)
| | | | | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | | | | | | | | Organic_carbon[16,20) <= 0: Not Potable (1.0)
| | | | | | | | | | | | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | | | | | | | | | | | Solids <= 0.818584: Potable (3.0)
| | | | | | | | | | | | | | | | | | | Solids > 0.818584: Not Potable (1.0)
| | | | | | | | | | | | | | | | | Organic_carbon[12,16) > 0: Potable (2.0)
| | | | | | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | | | | | Solids <= 0.377252: Potable (1.0)
| | | | | | | | | | | | | | | | | Solids > 0.377252: Not Potable (2.0)
| | | | | | | | | | | | | | Trihalomethanes[80,90) > 0: Not Potable (1.0)
| | | | | | | | | | | | | Trihalomethanes[100,110) > 0: Not Potable (2.0)
| | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | | | | | Solids <= 0.535701: Not Potable (2.0)
| | | | | | | | | | | | | | | | Solids > 0.535701: Potable (2.0)
| | | | | | | | | | | | | | | Trihalomethanes[50,60) > 0: Not Potable (2.0)
| | | | | | | | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (3.0)
| | | | | | | | | | | | | Trihalomethanes[80,90) > 0: Potable (1.0)
| | | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | | Trihalomethanes[90,100) <= 0: Not Potable (4.0)
| | | | | | | | | | | | Trihalomethanes[90,100) > 0: Potable (1.0)
| | | | | | | | | | Trihalomethanes[30,40) > 0
| | | | | | | | | | | Organic_carbon[12,16) <= 0: Not Potable (1.0)
| | | | | | | | | | | Organic_carbon[12,16) > 0: Potable (2.0)
| | | | | | | | | Conductivity[500,600) > 0
| | | | | | | | | | Organic_carbon[8,12) <= 0: Potable (6.0)
| | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (1.0)
| | | Organic_carbon[20,24) > 0
| | | | Hardness[240,280) <= 0
| | | | | Turbidity[5,7) <= 0
| | | | | | Sulfate[264,308) <= 0
| | | | | | | Chloramines[7,11) <= 0: Not Potable (8.0/1.0)
| | | | | | | Chloramines[7,11) > 0
| | | | | | | | Turbidity[3,5) <= 0: Potable (2.0)
| | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | Trihalomethanes[100,110) <= 0: Not Potable (6.0/1.0)
| | | | | | | | | | Trihalomethanes[100,110) > 0: Potable (1.0)
| | | | | | | | | Conductivity[400,500) > 0: Potable (1.0)
| | | | | | Sulfate[264,308) > 0
| | | | | | | Hardness[200,240) <= 0: Potable (3.0)
| | | | | | | Hardness[200,240) > 0
| | | | | | | | Trihalomethanes[70,80) <= 0: Potable (1.0)
| | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (2.0)
| | | | | Turbidity[5,7) > 0: Potable (3.0)
| | | | Hardness[240,280) > 0: Potable (3.0)
| | Trihalomethanes[40,50) > 0
| | | Organic_carbon[16,20) <= 0
| | | | ph[7,11) <= 0
| | | | | Turbidity[5,7) <= 0
| | | | | | Hardness[240,280) <= 0
| | | | | | | Conductivity[400,500) <= 0
| | | | | | | | Turbidity[3,5) <= 0: Potable (1.0)
| | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | Organic_carbon[20,24) <= 0
| | | | | | | | | | Organic_carbon[12,16) <= 0: Potable (5.0/1.0)
| | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | Solids <= 0.62018: Not Potable (2.0)
| | | | | | | | | | | | | Solids > 0.62018: Potable (2.0)
| | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | Solids <= 0.283695: Potable (1.0)
| | | | | | | | | | | | | Solids > 0.283695: Not Potable (4.0)
| | | | | | | | | | | Chloramines[7,11) > 0: Not Potable (6.0/1.0)
| | | | | | | | | Organic_carbon[20,24) > 0: Not Potable (1.0)
| | | | | | | Conductivity[400,500) > 0
| | | | | | | | Solids <= 0.394436: Potable (1.0)
| | | | | | | | Solids > 0.394436: Not Potable (5.0)
| | | | | | Hardness[240,280) > 0: Not Potable (2.0)
| | | | | Turbidity[5,7) > 0: Not Potable (3.0)
| | | | ph[7,11) > 0
| | | | | Hardness[240,280) <= 0
| | | | | | Organic_carbon[8,12) <= 0
| | | | | | | Organic_carbon[12,16) <= 0: Potable (2.0)
| | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | Hardness[200,240) <= 0: Potable (7.0/2.0)
| | | | | | | | Hardness[200,240) > 0
| | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | Turbidity[3,5) <= 0
| | | | | | | | | | | Turbidity[5,7) <= 0: Not Potable (2.0)
| | | | | | | | | | | Turbidity[5,7) > 0
| | | | | | | | | | | | Conductivity[400,500) <= 0: Potable (2.0)
| | | | | | | | | | | | Conductivity[400,500) > 0: Not Potable (1.0)
| | | | | | | | | | Turbidity[3,5) > 0: Potable (2.0)
| | | | | | | | | Chloramines[7,11) > 0: Not Potable (2.0)
| | | | | | Organic_carbon[8,12) > 0
| | | | | | | Sulfate[264,308) <= 0: Not Potable (6.0)
| | | | | | | Sulfate[264,308) > 0: Potable (1.0)
| | | | | Hardness[240,280) > 0: Potable (2.0)
| | | Organic_carbon[16,20) > 0
| | | | Sulfate[264,308) <= 0
| | | | | Sulfate[352,396) <= 0: Potable (1.0)
| | | | | Sulfate[352,396) > 0
| | | | | | Hardness[160,200) <= 0
| | | | | | | Hardness[200,240) <= 0: Potable (2.0)
| | | | | | | Hardness[200,240) > 0
| | | | | | | | Solids <= 0.544185: Not Potable (3.0)
| | | | | | | | Solids > 0.544185: Potable (1.0)
| | | | | | Hardness[160,200) > 0: Not Potable (5.0)
| | | | Sulfate[264,308) > 0: Not Potable (8.0)
| Sulfate[308,352) > 0
| | Hardness[240,280) <= 0
| | | Trihalomethanes[90,100) <= 0
| | | | Hardness[160,200) <= 0
| | | | | Hardness[200,240) <= 0
| | | | | | ph[7,11) <= 0
| | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | Trihalomethanes[40,50) <= 0: Potable (11.0/1.0)
| | | | | | | | | | Trihalomethanes[40,50) > 0
| | | | | | | | | | | Conductivity[400,500) <= 0: Not Potable (4.0)
| | | | | | | | | | | Conductivity[400,500) > 0: Potable (1.0)
| | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | Trihalomethanes[80,90) <= 0: Not Potable (4.0)
| | | | | | | | | | Trihalomethanes[80,90) > 0: Potable (1.0)
| | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | Trihalomethanes[40,50) <= 0: Not Potable (9.0/1.0)
| | | | | | | | | Trihalomethanes[40,50) > 0: Potable (1.0)
| | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | Solids <= 0.310076: Potable (1.0)
| | | | | | | | | | Solids > 0.310076: Not Potable (3.0)
| | | | | | | | | Conductivity[300,400) > 0: Potable (1.0)
| | | | | | | | Chloramines[7,11) > 0: Not Potable (6.0)
| | | | | | ph[7,11) > 0
| | | | | | | Trihalomethanes[40,50) <= 0
| | | | | | | | Turbidity[5,7) <= 0
| | | | | | | | | Trihalomethanes[80,90) <= 0: Potable (15.0/2.0)
| | | | | | | | | Trihalomethanes[80,90) > 0: Not Potable (3.0/1.0)
| | | | | | | | Turbidity[5,7) > 0: Not Potable (1.0)
| | | | | | | Trihalomethanes[40,50) > 0: Not Potable (2.0)
| | | | | Hardness[200,240) > 0
| | | | | | Chloramines[7,11) <= 0
| | | | | | | Turbidity[3,5) <= 0
| | | | | | | | Organic_carbon[16,20) <= 0: Not Potable (16.0)
| | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | ph[7,11) <= 0: Potable (3.0)
| | | | | | | | | ph[7,11) > 0: Not Potable (2.0)
| | | | | | | Turbidity[3,5) > 0
| | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | | | | | Conductivity[600,700) <= 0
| | | | | | | | | | | | | | Trihalomethanes[100,110) <= 0
| | | | | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | | | | Trihalomethanes[80,90) <= 0: Potable (6.0/2.0)
| | | | | | | | | | | | | | | | Trihalomethanes[80,90) > 0: Not Potable (2.0)
| | | | | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | | | | Trihalomethanes[80,90) <= 0: Not Potable (8.0/1.0)
| | | | | | | | | | | | | | | | Trihalomethanes[80,90) > 0: Potable (1.0)
| | | | | | | | | | | | | | Trihalomethanes[100,110) > 0: Potable (1.0)
| | | | | | | | | | | | | Conductivity[600,700) > 0: Potable (1.0)
| | | | | | | | | | | | Conductivity[500,600) > 0: Potable (4.0)
| | | | | | | | | | | Trihalomethanes[70,80) > 0
| | | | | | | | | | | | Organic_carbon[8,12) <= 0: Not Potable (14.0/3.0)
| | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (1.0)
| | | | | | | | | | Trihalomethanes[50,60) > 0
| | | | | | | | | | | Organic_carbon[12,16) <= 0: Not Potable (7.0)
| | | | | | | | | | | Organic_carbon[12,16) > 0: Potable (1.0)
| | | | | | | | | ph[7,11) > 0
| | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | Conductivity[400,500) <= 0: Not Potable (23.0/1.0)
| | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | | | | Solids <= 0.565998: Not Potable (9.0/2.0)
| | | | | | | | | | | | | | | Solids > 0.565998: Potable (2.0)
| | | | | | | | | | | | | | Trihalomethanes[70,80) > 0
| | | | | | | | | | | | | | | Solids <= 0.380268: Potable (1.0)
| | | | | | | | | | | | | | | Solids > 0.380268: Not Potable (2.0)
| | | | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (2.0)
| | | | | | | | | | | Trihalomethanes[50,60) > 0
| | | | | | | | | | | | Organic_carbon[12,16) <= 0: Potable (3.0)
| | | | | | | | | | | | Organic_carbon[12,16) > 0: Not Potable (8.0/1.0)
| | | | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | Trihalomethanes[40,50) <= 0
| | | | | | | | | | | | | | Solids <= 0.487788
| | | | | | | | | | | | | | | Solids <= 0.322012: Not Potable (1.0)
| | | | | | | | | | | | | | | Solids > 0.322012: Potable (3.0)
| | | | | | | | | | | | | | Solids > 0.487788: Not Potable (4.0)
| | | | | | | | | | | | | Trihalomethanes[40,50) > 0: Potable (1.0)
| | | | | | | | | | | | Conductivity[300,400) > 0: Not Potable (3.0)
| | | | | | | | | | | Trihalomethanes[70,80) > 0
| | | | | | | | | | | | Conductivity[500,600) <= 0: Potable (8.0/3.0)
| | | | | | | | | | | | Conductivity[500,600) > 0: Not Potable (1.0)
| | | | | | | | Trihalomethanes[30,40) > 0
| | | | | | | | | Organic_carbon[12,16) <= 0: Potable (2.0)
| | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | Solids <= 0.577692: Not Potable (5.0)
| | | | | | | | | | Solids > 0.577692: Potable (2.0)
| | | | | | Chloramines[7,11) > 0
| | | | | | | Turbidity[5,7) <= 0
| | | | | | | | Conductivity[500,600) <= 0
| | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | Trihalomethanes[40,50) <= 0
| | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | | | | | | Turbidity[3,5) <= 0: Potable (2.0)
| | | | | | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0: Potable (3.0)
| | | | | | | | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Potable (2.0)
| | | | | | | | | | | | | | | | | | Conductivity[300,400) > 0: Not Potable (3.0)
| | | | | | | | | | | | | | | | Conductivity[400,500) > 0: Not Potable (10.0/3.0)
| | | | | | | | | | | | | | Trihalomethanes[80,90) > 0: Not Potable (8.0/2.0)
| | | | | | | | | | | | | Trihalomethanes[50,60) > 0: Not Potable (7.0/1.0)
| | | | | | | | | | | | Trihalomethanes[30,40) > 0: Not Potable (2.0)
| | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (9.0/1.0)
| | | | | | | | | | Trihalomethanes[40,50) > 0: Not Potable (5.0)
| | | | | | | | | ph[7,11) > 0
| | | | | | | | | | Trihalomethanes[40,50) <= 0
| | | | | | | | | | | Trihalomethanes[30,40) <= 0
| | | | | | | | | | | | Trihalomethanes[50,60) <= 0
| | | | | | | | | | | | | Organic_carbon[20,24) <= 0
| | | | | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | | | | Organic_carbon[12,16) <= 0: Not Potable (8.0)
| | | | | | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | | | | | Trihalomethanes[70,80) <= 0
| | | | | | | | | | | | | | | | | Solids <= 0.357857: Potable (6.0/1.0)
| | | | | | | | | | | | | | | | | Solids > 0.357857
| | | | | | | | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | | | | | | | Conductivity[300,400) <= 0: Potable (1.0)
| | | | | | | | | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | | | | | | | | Solids <= 0.652077: Not Potable (3.0)
| | | | | | | | | | | | | | | | | | | | Solids > 0.652077: Potable (1.0)
| | | | | | | | | | | | | | | | | | Conductivity[400,500) > 0: Not Potable (4.0)
| | | | | | | | | | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (10.0/2.0)
| | | | | | | | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | | | | | Trihalomethanes[70,80) <= 0: Not Potable (4.0/1.0)
| | | | | | | | | | | | | | | | Trihalomethanes[70,80) > 0: Potable (2.0)
| | | | | | | | | | | | | | | Trihalomethanes[60,70) > 0: Potable (2.0)
| | | | | | | | | | | | | Organic_carbon[20,24) > 0: Potable (2.0)
| | | | | | | | | | | | Trihalomethanes[50,60) > 0
| | | | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | | | Conductivity[400,500) <= 0: Not Potable (2.0)
| | | | | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | | | | Solids <= 0.585777: Potable (2.0)
| | | | | | | | | | | | | | | | Solids > 0.585777: Not Potable (1.0)
| | | | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (6.0/1.0)
| | | | | | | | | | | | | Organic_carbon[16,20) > 0: Not Potable (3.0)
| | | | | | | | | | | Trihalomethanes[30,40) > 0
| | | | | | | | | | | | Solids <= 0.570578: Not Potable (3.0)
| | | | | | | | | | | | Solids > 0.570578: Potable (1.0)
| | | | | | | | | | Trihalomethanes[40,50) > 0
| | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | Conductivity[300,400) <= 0: Potable (3.0)
| | | | | | | | | | | | Conductivity[300,400) > 0
| | | | | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | | | | Solids <= 0.378233: Not Potable (1.0)
| | | | | | | | | | | | | | Solids > 0.378233: Potable (2.0)
| | | | | | | | | | | | | Organic_carbon[12,16) > 0: Not Potable (1.0)
| | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (1.0)
| | | | | | | | Conductivity[500,600) > 0: Not Potable (23.0/4.0)
| | | | | | | Turbidity[5,7) > 0
| | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | Trihalomethanes[70,80) <= 0: Potable (8.0/1.0)
| | | | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (3.0/1.0)
| | | | | | | | | Organic_carbon[16,20) > 0: Not Potable (1.0)
| | | | | | | | Conductivity[400,500) > 0: Not Potable (4.0)
| | | | Hardness[160,200) > 0
| | | | | Trihalomethanes[50,60) <= 0
| | | | | | Conductivity[500,600) <= 0
| | | | | | | Turbidity[3,5) <= 0: Not Potable (43.0/7.0)
| | | | | | | Turbidity[3,5) > 0
| | | | | | | | Trihalomethanes[40,50) <= 0
| | | | | | | | | Trihalomethanes[80,90) <= 0
| | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | Trihalomethanes[60,70) <= 0
| | | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | | Conductivity[400,500) <= 0: Not Potable (6.0/1.0)
| | | | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | | | Solids <= 0.506183: Not Potable (5.0)
| | | | | | | | | | | | | | | Solids > 0.506183: Potable (5.0/1.0)
| | | | | | | | | | | | | Chloramines[7,11) > 0: Not Potable (17.0/2.0)
| | | | | | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | | | | | ph[7,11) <= 0: Not Potable (9.0/2.0)
| | | | | | | | | | | | | ph[7,11) > 0
| | | | | | | | | | | | | | Chloramines[7,11) <= 0: Not Potable (6.0/1.0)
| | | | | | | | | | | | | | Chloramines[7,11) > 0: Potable (3.0)
| | | | | | | | | | | Trihalomethanes[60,70) > 0
| | | | | | | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | | Conductivity[600,700) <= 0
| | | | | | | | | | | | | | | ph[7,11) <= 0: Not Potable (10.0/3.0)
| | | | | | | | | | | | | | | ph[7,11) > 0: Potable (8.0/2.0)
| | | | | | | | | | | | | | Conductivity[600,700) > 0: Potable (1.0)
| | | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | | Solids <= 0.757621: Not Potable (13.0/3.0)
| | | | | | | | | | | | | | Solids > 0.757621: Potable (2.0)
| | | | | | | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | | | | | | Solids <= 0.54981: Not Potable (15.0)
| | | | | | | | | | | | | Solids > 0.54981
| | | | | | | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | | | | Solids <= 0.713875: Potable (2.0)
| | | | | | | | | | | | | | | | Solids > 0.713875: Not Potable (1.0)
| | | | | | | | | | | | | | | Chloramines[7,11) > 0: Not Potable (1.0)
| | | | | | | | | | | | | | ph[7,11) > 0: Potable (2.0)
| | | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | | ph[7,11) <= 0: Not Potable (11.0/1.0)
| | | | | | | | | | | ph[7,11) > 0
| | | | | | | | | | | | Solids <= 0.575729: Not Potable (11.0)
| | | | | | | | | | | | Solids > 0.575729
| | | | | | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | | | | | Solids <= 0.801307: Not Potable (2.0)
| | | | | | | | | | | | | | Solids > 0.801307: Potable (1.0)
| | | | | | | | | | | | | Conductivity[300,400) > 0: Potable (2.0)
| | | | | | | | | Trihalomethanes[80,90) > 0
| | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | | ph[7,11) <= 0: Not Potable (4.0/1.0)
| | | | | | | | | | | | | ph[7,11) > 0: Potable (4.0)
| | | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | | | | | Solids <= 0.594774: Potable (2.0)
| | | | | | | | | | | | | | Solids > 0.594774: Not Potable (1.0)
| | | | | | | | | | | | | ph[7,11) > 0: Not Potable (3.0)
| | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | | | Solids <= 0.501397: Not Potable (4.0)
| | | | | | | | | | | | | Solids > 0.501397: Potable (2.0)
| | | | | | | | | | | | Chloramines[7,11) > 0: Not Potable (7.0)
| | | | | | | | | | Organic_carbon[8,12) > 0: Potable (2.0)
| | | | | | | | Trihalomethanes[40,50) > 0
| | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | Solids <= 0.764996: Not Potable (10.0/1.0)
| | | | | | | | | | | Solids > 0.764996
| | | | | | | | | | | | Solids <= 0.883745: Potable (2.0)
| | | | | | | | | | | | Solids > 0.883745: Not Potable (1.0)
| | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | Organic_carbon[20,24) <= 0
| | | | | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | | | | Solids <= 0.377252: Potable (1.0)
| | | | | | | | | | | | | Solids > 0.377252: Not Potable (4.0)
| | | | | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | | | | | Solids <= 0.673943: Not Potable (3.0)
| | | | | | | | | | | | | | Solids > 0.673943: Potable (1.0)
| | | | | | | | | | | | | ph[7,11) > 0: Potable (4.0/1.0)
| | | | | | | | | | | Organic_carbon[20,24) > 0: Potable (1.0)
| | | | | | | | | Organic_carbon[8,12) > 0
| | | | | | | | | | Conductivity[300,400) <= 0: Not Potable (3.0/1.0)
| | | | | | | | | | Conductivity[300,400) > 0: Potable (3.0)
| | | | | | Conductivity[500,600) > 0
| | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | Solids <= 0.733408: Not Potable (17.0/2.0)
| | | | | | | | | Solids > 0.733408: Potable (1.0)
| | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | Trihalomethanes[80,90) <= 0: Not Potable (24.0/4.0)
| | | | | | | | | Trihalomethanes[80,90) > 0
| | | | | | | | | | ph[7,11) <= 0: Not Potable (2.0)
| | | | | | | | | | ph[7,11) > 0: Potable (2.0)
| | | | | | | Organic_carbon[8,12) > 0: Not Potable (9.0)
| | | | | Trihalomethanes[50,60) > 0
| | | | | | Conductivity[500,600) <= 0
| | | | | | | Organic_carbon[16,20) <= 0
| | | | | | | | Solids <= 0.638043
| | | | | | | | | Organic_carbon[20,24) <= 0
| | | | | | | | | | Chloramines[7,11) <= 0
| | | | | | | | | | | Turbidity[3,5) <= 0
| | | | | | | | | | | | ph[7,11) <= 0: Potable (2.0)
| | | | | | | | | | | | ph[7,11) > 0: Not Potable (1.0)
| | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | ph[7,11) <= 0: Not Potable (10.0/2.0)
| | | | | | | | | | | | ph[7,11) > 0
| | | | | | | | | | | | | Solids <= 0.413997: Not Potable (2.0)
| | | | | | | | | | | | | Solids > 0.413997
| | | | | | | | | | | | | | Solids <= 0.461826: Potable (2.0)
| | | | | | | | | | | | | | Solids > 0.461826: Not Potable (1.0)
| | | | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | | | Conductivity[400,500) <= 0
| | | | | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | | | | Organic_carbon[8,12) <= 0: Potable (3.0)
| | | | | | | | | | | | | Organic_carbon[8,12) > 0: Not Potable (3.0)
| | | | | | | | | | | | ph[7,11) > 0
| | | | | | | | | | | | | Solids <= 0.622257: Potable (2.0)
| | | | | | | | | | | | | Solids > 0.622257: Not Potable (1.0)
| | | | | | | | | | | Conductivity[400,500) > 0
| | | | | | | | | | | | Organic_carbon[8,12) <= 0
| | | | | | | | | | | | | Turbidity[3,5) <= 0: Not Potable (2.0)
| | | | | | | | | | | | | Turbidity[3,5) > 0
| | | | | | | | | | | | | | ph[7,11) <= 0
| | | | | | | | | | | | | | | Solids <= 0.475691: Not Potable (3.0)
| | | | | | | | | | | | | | | Solids > 0.475691: Potable (1.0)
| | | | | | | | | | | | | | ph[7,11) > 0: Potable (1.0)
| | | | | | | | | | | | Organic_carbon[8,12) > 0: Potable (2.0)
| | | | | | | | | Organic_carbon[20,24) > 0: Not Potable (2.0)
| | | | | | | | Solids > 0.638043: Not Potable (9.0)
| | | | | | | Organic_carbon[16,20) > 0
| | | | | | | | Chloramines[7,11) <= 0: Not Potable (9.0/3.0)
| | | | | | | | Chloramines[7,11) > 0
| | | | | | | | | Conductivity[300,400) <= 0
| | | | | | | | | | ph[7,11) <= 0: Potable (2.0)
| | | | | | | | | | ph[7,11) > 0: Not Potable (2.0)
| | | | | | | | | Conductivity[300,400) > 0: Potable (3.0)
| | | | | | Conductivity[500,600) > 0
| | | | | | | Turbidity[3,5) <= 0: Not Potable (2.0)
| | | | | | | Turbidity[3,5) > 0
| | | | | | | | Organic_carbon[20,24) <= 0
| | | | | | | | | Organic_carbon[12,16) <= 0
| | | | | | | | | | ph[7,11) <= 0: Not Potable (2.0)
| | | | | | | | | | ph[7,11) > 0: Potable (3.0/1.0)
| | | | | | | | | Organic_carbon[12,16) > 0
| | | | | | | | | | Solids <= 0.426479: Not Potable (1.0)
| | | | | | | | | | Solids > 0.426479: Potable (3.0)
| | | | | | | | Organic_carbon[20,24) > 0: Potable (1.0)
| | | Trihalomethanes[90,100) > 0
| | | | Conductivity[300,400) <= 0
| | | | | Solids <= 0.707437
| | | | | | Organic_carbon[16,20) <= 0
| | | | | | | Conductivity[400,500) <= 0
| | | | | | | | Solids <= 0.300286: Not Potable (1.0)
| | | | | | | | Solids > 0.300286: Potable (3.0)
| | | | | | | Conductivity[400,500) > 0
| | | | | | | | Solids <= 0.269225: Potable (2.0)
| | | | | | | | Solids > 0.269225
| | | | | | | | | Solids <= 0.593855: Not Potable (4.0)
| | | | | | | | | Solids > 0.593855: Potable (1.0)
| | | | | | Organic_carbon[16,20) > 0: Potable (6.0/1.0)
| | | | | Solids > 0.707437: Not Potable (4.0)
| | | | Conductivity[300,400) > 0: Not Potable (10.0/2.0)
| | Hardness[240,280) > 0
| | | ph[7,11) <= 0
| | | | Trihalomethanes[30,40) <= 0
| | | | | Trihalomethanes[60,70) <= 0: Potable (9.0/1.0)
| | | | | Trihalomethanes[60,70) > 0: Not Potable (6.0/2.0)
| | | | Trihalomethanes[30,40) > 0: Not Potable (1.0)
| | | ph[7,11) > 0
| | | | Trihalomethanes[60,70) <= 0
| | | | | Turbidity[5,7) <= 0
| | | | | | Chloramines[7,11) <= 0
| | | | | | | Solids <= 0.48065: Not Potable (8.0)
| | | | | | | Solids > 0.48065
| | | | | | | | Trihalomethanes[70,80) <= 0: Potable (3.0)
| | | | | | | | Trihalomethanes[70,80) > 0: Not Potable (1.0)
| | | | | | Chloramines[7,11) > 0
| | | | | | | Conductivity[300,400) <= 0
| | | | | | | | Trihalomethanes[40,50) <= 0: Potable (4.0)
| | | | | | | | Trihalomethanes[40,50) > 0: Not Potable (1.0)
| | | | | | | Conductivity[300,400) > 0: Not Potable (1.0)
| | | | | Turbidity[5,7) > 0: Not Potable (2.0)
| | | | Trihalomethanes[60,70) > 0: Not Potable (2.0)
Sulfate[396,440) > 0
| Trihalomethanes[90,100) <= 0
| | ph[7,11) <= 0
| | | Chloramines[7,11) <= 0
| | | | Trihalomethanes[80,90) <= 0
| | | | | Trihalomethanes[50,60) <= 0
| | | | | | Turbidity[3,5) <= 0: Not Potable (1.0)
| | | | | | Turbidity[3,5) > 0: Potable (11.0/1.0)
| | | | | Trihalomethanes[50,60) > 0: Not Potable (2.0)
| | | | Trihalomethanes[80,90) > 0: Not Potable (3.0)
| | | Chloramines[7,11) > 0: Potable (24.0/2.0)
| | ph[7,11) > 0
| | | Turbidity[5,7) <= 0
| | | | Trihalomethanes[50,60) <= 0
| | | | | Trihalomethanes[40,50) <= 0
| | | | | | Hardness[200,240) <= 0: Not Potable (14.0/1.0)
| | | | | | Hardness[200,240) > 0
| | | | | | | Chloramines[7,11) <= 0: Not Potable (6.0/1.0)
| | | | | | | Chloramines[7,11) > 0: Potable (4.0)
| | | | | Trihalomethanes[40,50) > 0
| | | | | | Hardness[160,200) <= 0: Not Potable (1.0)
| | | | | | Hardness[160,200) > 0: Potable (2.0)
| | | | Trihalomethanes[50,60) > 0
| | | | | Conductivity[400,500) <= 0: Potable (3.0)
| | | | | Conductivity[400,500) > 0
| | | | | | Turbidity[3,5) <= 0: Potable (1.0)
| | | | | | Turbidity[3,5) > 0: Not Potable (2.0)
| | | Turbidity[5,7) > 0: Not Potable (2.0)
| Trihalomethanes[90,100) > 0: Not Potable (4.0)
Number of Leaves : 437
Size of the tree : 873
pred_probs <- predict(C45Fit, newdata = wp, type = "prob")[, "Potable"]
binary_outcome <- as.numeric(wp$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.9692406
C5.0 newer version of C4.5 Splitting the data set into two subsets: Training(70%) and Testing(30%):
set.seed(1958)
train.indices <- sample(2, nrow(water_potability), replace=TRUE, prob=c(0.7, 0.3))
w.train <- water_potability[train.indices == 1, ]
w.test <- water_potability[train.indices == 2, ]
w.train$Potability <- as.factor(w.train$Potability)
model <- C5.0(Potability ~., data=w.train)
results <- predict(object=model, newdata=w.test, type="class")
table(results, w.test$Potability)
results Not Potable Potable
Not Potable 290 195
Potable 21 21
plot(model)
r <- confusionMatrix(results, w.test$Potability)
acc <- r$overall["Accuracy"]*100
acc
Accuracy
59.01328
as.matrix(r, what = "classes")
[,1]
Sensitivity 0.93247588
Specificity 0.09722222
Pos Pred Value 0.59793814
Neg Pred Value 0.50000000
Precision 0.59793814
Recall 0.93247588
F1 0.72864322
Prevalence 0.59013283
Detection Rate 0.55028463
Detection Prevalence 0.92030361
Balanced Accuracy 0.51484905
print(r)
Confusion Matrix and Statistics
Reference
Prediction Not Potable Potable
Not Potable 290 195
Potable 21 21
Accuracy : 0.5901
95% CI : (0.5468, 0.6325)
No Information Rate : 0.5901
P-Value [Acc > NIR] : 0.5187
Kappa : 0.0339
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.93248
Specificity : 0.09722
Pos Pred Value : 0.59794
Neg Pred Value : 0.50000
Prevalence : 0.59013
Detection Rate : 0.55028
Detection Prevalence : 0.92030
Balanced Accuracy : 0.51485
'Positive' Class : Not Potable
pred_probs <- predict(model, newdata = w.test, type = "prob")[, "Potable"]
binary_outcome <- as.numeric(w.test$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.515318
Splitting the data set into two subsets: Training(80%) and Testing(20%):
set.seed(1958)
train.indices <- sample(2, nrow(water_potability), replace=TRUE, prob=c(0.8, 0.2))
w.train <- water_potability[train.indices == 1, ]
w.test <- water_potability[train.indices == 2, ]
w.train$Potability <- as.factor(w.train$Potability)
model <- C5.0(Potability ~., data=w.train)
results <- predict(object=model, newdata=w.test, type="class")
table(results, w.test$Potability)
results Not Potable Potable
Not Potable 189 143
Potable 11 14
plot(model)
r <- confusionMatrix(results, w.test$Potability)
acc <- r$overall["Accuracy"]*100
acc
Accuracy
56.86275
as.matrix(r, what = "classes")
[,1]
Sensitivity 0.94500000
Specificity 0.08917197
Pos Pred Value 0.56927711
Neg Pred Value 0.56000000
Precision 0.56927711
Recall 0.94500000
F1 0.71052632
Prevalence 0.56022409
Detection Rate 0.52941176
Detection Prevalence 0.92997199
Balanced Accuracy 0.51708599
print(r)
Confusion Matrix and Statistics
Reference
Prediction Not Potable Potable
Not Potable 189 143
Potable 11 14
Accuracy : 0.5686
95% CI : (0.5155, 0.6206)
No Information Rate : 0.5602
P-Value [Acc > NIR] : 0.3957
Kappa : 0.0376
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.94500
Specificity : 0.08917
Pos Pred Value : 0.56928
Neg Pred Value : 0.56000
Prevalence : 0.56022
Detection Rate : 0.52941
Detection Prevalence : 0.92997
Balanced Accuracy : 0.51709
'Positive' Class : Not Potable
pred_probs <- predict(model, newdata = w.test, type = "prob")[, "Potable"]
binary_outcome <- as.numeric(w.test$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5175796
Splitting the data set into two subsets: Training(90%) and Testing(10%):
set.seed(1958)
train.indices <- sample(2, nrow(water_potability), replace=TRUE, prob=c(0.9, 0.1))
w.train <- water_potability[train.indices == 1, ]
w.test <- water_potability[train.indices == 2, ]
w.train$Potability <- as.factor(w.train$Potability)
model <- C5.0(Potability ~., data=w.train)
results <- predict(object=model, newdata=w.test, type="class")
table(results, w.test$Potability)
results Not Potable Potable
Not Potable 92 48
Potable 12 25
plot(model)
To improve the readability of the decision tree, we decided to sample the data using only the pH and sulfate attributes. We then split the data into training and testing sets using the same split points: Training(90%) and Testing(10%), which allowed for a more manageable decision tree:
set.seed(1958)
importent_feature_sample <- select(water_potability,c(1,5,10))
train.indices <- sample(2, nrow(importent_feature_sample), replace=TRUE, prob=c(0.9, 0.1))
w.train <- importent_feature_sample[train.indices == 1, ]
w.test <- importent_feature_sample[train.indices == 2, ]
w.train$Potability <- as.factor(w.train$Potability)
model <- C5.0(Potability ~., data=w.train)
results <- predict(object=model, newdata=w.test, type="class")
table(results, w.test$Potability)
results Not Potable Potable
Not Potable 92 55
Potable 12 18
plot(model)
r <- confusionMatrix(results, w.test$Potability)
acc <- r$overall["Accuracy"]*100
acc
Accuracy
62.14689
as.matrix(r, what = "classes")
[,1]
Sensitivity 0.8846154
Specificity 0.2465753
Pos Pred Value 0.6258503
Neg Pred Value 0.6000000
Precision 0.6258503
Recall 0.8846154
F1 0.7330677
Prevalence 0.5875706
Detection Rate 0.5197740
Detection Prevalence 0.8305085
Balanced Accuracy 0.5655954
print(r)
Confusion Matrix and Statistics
Reference
Prediction Not Potable Potable
Not Potable 92 55
Potable 12 18
Accuracy : 0.6215
95% CI : (0.5456, 0.6932)
No Information Rate : 0.5876
P-Value [Acc > NIR] : 0.2009
Kappa : 0.1438
Mcnemar's Test P-Value : 2.88e-07
Sensitivity : 0.8846
Specificity : 0.2466
Pos Pred Value : 0.6259
Neg Pred Value : 0.6000
Prevalence : 0.5876
Detection Rate : 0.5198
Detection Prevalence : 0.8305
Balanced Accuracy : 0.5656
'Positive' Class : Not Potable
pred_probs <- predict(model, newdata = w.test, type = "prob")[, "Potable"]
binary_outcome <- as.numeric(w.test$Potability == "Potable")
# ROC curve
roc_curve <- roc(binary_outcome, pred_probs)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5474842
Gini index (CART) Splitting the data set into two subsets: Training(70%) and Testing(30%):
set.seed(1958)
train = sample(2, nrow(wp), replace=TRUE, prob=c(0.7, 0.3))
wp.train=wp[train == 1,]
wp.test=wp[train == 2,]
fit.tree = rpart(Potability ~ ., data=wp, method = "class", cp=0.008)
fit.tree
n= 1750
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 1750 684 Not Potable (0.6091429 0.3908571)
2) Sulfate=[308,352) 806 263 Not Potable (0.6736973 0.3263027) *
3) Sulfate=[220,264),[264,308),[352,396),[396,440) 944 421 Not Potable (0.5540254 0.4459746)
6) Sulfate=[264,308),[352,396) 819 349 Not Potable (0.5738706 0.4261294)
12) Solids< 0.7847186 758 312 Not Potable (0.5883905 0.4116095) *
13) Solids>=0.7847186 61 24 Potable (0.3934426 0.6065574) *
7) Sulfate=[220,264),[396,440) 125 53 Potable (0.4240000 0.5760000) *
rpart.plot(fit.tree)
fit.tree$variable.importance
Sulfate Solids Trihalomethanes
17.32500374 4.33758572 0.04635208
pred.tree = predict(fit.tree, wp.test, type = "class")
re <- table(pred.tree, wp.test$Potability)
co_re <- confusionMatrix(re)
print(co_re)
Confusion Matrix and Statistics
pred.tree Not Potable Potable
Not Potable 285 185
Potable 26 31
Accuracy : 0.5996
95% CI : (0.5564, 0.6417)
No Information Rate : 0.5901
P-Value [Acc > NIR] : 0.3459
Kappa : 0.0675
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9164
Specificity : 0.1435
Pos Pred Value : 0.6064
Neg Pred Value : 0.5439
Prevalence : 0.5901
Detection Rate : 0.5408
Detection Prevalence : 0.8918
Balanced Accuracy : 0.5300
'Positive' Class : Not Potable
as.matrix(co_re, what = "classes")
[,1]
Sensitivity 0.9163987
Specificity 0.1435185
Pos Pred Value 0.6063830
Neg Pred Value 0.5438596
Precision 0.6063830
Recall 0.9163987
F1 0.7298335
Prevalence 0.5901328
Detection Rate 0.5407970
Detection Prevalence 0.8918406
Balanced Accuracy 0.5299586
acc <- co_re$overall["Accuracy"]
acc*100
Accuracy
59.96205
plotcp(fit.tree)
printcp(fit.tree)
Classification tree:
rpart(formula = Potability ~ ., data = wp, method = "class",
cp = 0.008)
Variables actually used in tree construction:
[1] Solids Sulfate
Root node error: 684/1750 = 0.39086
n= 1750
CP nsplit rel error xerror xstd
1 0.013889 0 1.00000 1.00000 0.029842
2 0.008000 3 0.95322 0.97368 0.029695
# Explicitly request the lowest cp value
fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
[1] 0.008
bestcp <-fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(pruned.tree)
pred.prune = predict(pruned.tree, wp.test, type="class")
re <- table(pred.prune, wp.test$Potability)
co_re <- confusionMatrix(re)
print(co_re)
Confusion Matrix and Statistics
pred.prune Not Potable Potable
Not Potable 285 185
Potable 26 31
Accuracy : 0.5996
95% CI : (0.5564, 0.6417)
No Information Rate : 0.5901
P-Value [Acc > NIR] : 0.3459
Kappa : 0.0675
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9164
Specificity : 0.1435
Pos Pred Value : 0.6064
Neg Pred Value : 0.5439
Prevalence : 0.5901
Detection Rate : 0.5408
Detection Prevalence : 0.8918
Balanced Accuracy : 0.5300
'Positive' Class : Not Potable
as.matrix(co_re, what = "classes")
[,1]
Sensitivity 0.9163987
Specificity 0.1435185
Pos Pred Value 0.6063830
Neg Pred Value 0.5438596
Precision 0.6063830
Recall 0.9163987
F1 0.7298335
Prevalence 0.5901328
Detection Rate 0.5407970
Detection Prevalence 0.8918406
Balanced Accuracy 0.5299586
acc <- co_re$overall["Accuracy"]
acc*100
Accuracy
59.96205
pred.tree_raw <- predict(fit.tree, wp.test)
# Convert to probabilities
pred.tree_probs <- exp(pred.tree_raw) / (1 + exp(pred.tree_raw))
# Extract probabilities for the "Potable" class
roc_curve <- roc(ifelse(wp.test$Potability == "Potable", 1, 0), pred.tree_probs[, "Potable"])
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5769769
Splitting the data set into two subsets: Training(80%) and Testing(20%):
set.seed(1958)
train = sample(2, nrow(wp), replace=TRUE, prob=c(0.8, 0.2))
wp.train=wp[train == 1,]
wp.test=wp[train == 2,]
fit.tree = rpart(Potability ~ ., data=wp.train, method = "class", cp=0.008)
fit.tree
n= 1393
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 1393 527 Not Potable (0.6216798 0.3783202)
2) Sulfate=[308,352) 651 205 Not Potable (0.6850998 0.3149002) *
3) Sulfate=[220,264),[264,308),[352,396),[396,440) 742 322 Not Potable (0.5660377 0.4339623)
6) Solids< 0.6416071 598 240 Not Potable (0.5986622 0.4013378)
12) Sulfate=[264,308),[352,396) 519 197 Not Potable (0.6204239 0.3795761) *
13) Sulfate=[220,264),[396,440) 79 36 Potable (0.4556962 0.5443038)
26) Trihalomethanes=[20,30),[30,40),[80,90),[90,100) 18 6 Not Potable (0.6666667 0.3333333) *
27) Trihalomethanes=[40,50),[50,60),[60,70),[70,80),[100,110) 61 24 Potable (0.3934426 0.6065574)
54) Solids< 0.2515644 11 3 Not Potable (0.7272727 0.2727273) *
55) Solids>=0.2515644 50 16 Potable (0.3200000 0.6800000) *
7) Solids>=0.6416071 144 62 Potable (0.4305556 0.5694444)
14) Sulfate=[352,396),[396,440) 62 24 Not Potable (0.6129032 0.3870968)
28) Trihalomethanes=[30,40),[40,50),[60,70),[70,80) 45 13 Not Potable (0.7111111 0.2888889) *
29) Trihalomethanes=[50,60),[80,90),[100,110) 17 6 Potable (0.3529412 0.6470588) *
15) Sulfate=[220,264),[264,308) 82 24 Potable (0.2926829 0.7073171)
30) ph=[3,7) 32 16 Not Potable (0.5000000 0.5000000)
60) Chloramines=[7,11) 15 3 Not Potable (0.8000000 0.2000000) *
61) Chloramines=[3,7) 17 4 Potable (0.2352941 0.7647059) *
31) ph=[7,11) 50 8 Potable (0.1600000 0.8400000) *
rpart.plot(fit.tree)
fit.tree$variable.importance
Sulfate Solids Trihalomethanes Chloramines ph Hardness Organic_carbon
21.4704559 11.1997162 8.3888087 5.0823529 4.5112195 0.9695811 0.9263146
Conductivity
0.5397097
pred.tree = predict(fit.tree, wp.test, type = "class")
re <- table(pred.tree, wp.test$Potability)
co_re <- confusionMatrix(re)
print(co_re)
Confusion Matrix and Statistics
pred.tree Not Potable Potable
Not Potable 184 138
Potable 16 19
Accuracy : 0.5686
95% CI : (0.5155, 0.6206)
No Information Rate : 0.5602
P-Value [Acc > NIR] : 0.3957
Kappa : 0.0448
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9200
Specificity : 0.1210
Pos Pred Value : 0.5714
Neg Pred Value : 0.5429
Prevalence : 0.5602
Detection Rate : 0.5154
Detection Prevalence : 0.9020
Balanced Accuracy : 0.5205
'Positive' Class : Not Potable
as.matrix(co_re, what = "classes")
[,1]
Sensitivity 0.9200000
Specificity 0.1210191
Pos Pred Value 0.5714286
Neg Pred Value 0.5428571
Precision 0.5714286
Recall 0.9200000
F1 0.7049808
Prevalence 0.5602241
Detection Rate 0.5154062
Detection Prevalence 0.9019608
Balanced Accuracy 0.5205096
acc <- co_re$overall["Accuracy"]
acc*100
Accuracy
56.86275
plotcp(fit.tree)
printcp(fit.tree)
Classification tree:
rpart(formula = Potability ~ ., data = wp.train, method = "class",
cp = 0.008)
Variables actually used in tree construction:
[1] Chloramines ph Solids Sulfate Trihalomethanes
Root node error: 527/1393 = 0.37832
n= 1393
CP nsplit rel error xerror xstd
1 0.0189753 0 1.00000 1.00000 0.034346
2 0.0132827 3 0.93548 0.97913 0.034201
3 0.0113852 4 0.92220 0.98102 0.034215
4 0.0094877 5 0.91082 0.99810 0.034333
5 0.0085389 7 0.89184 0.99051 0.034281
6 0.0080000 9 0.87476 0.98102 0.034215
# Explicitly request the lowest cp value
fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
[1] 0.01328273
bestcp <-fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(pruned.tree)
pred.prune = predict(pruned.tree, wp.test, type="class")
re <- table(pred.prune, wp.test$Potability)
co_re <- confusionMatrix(re)
print(co_re)
Confusion Matrix and Statistics
pred.prune Not Potable Potable
Not Potable 190 141
Potable 10 16
Accuracy : 0.577
95% CI : (0.5239, 0.6289)
No Information Rate : 0.5602
P-Value [Acc > NIR] : 0.2793
Kappa : 0.057
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9500
Specificity : 0.1019
Pos Pred Value : 0.5740
Neg Pred Value : 0.6154
Prevalence : 0.5602
Detection Rate : 0.5322
Detection Prevalence : 0.9272
Balanced Accuracy : 0.5260
'Positive' Class : Not Potable
as.matrix(co_re, what = "classes")
[,1]
Sensitivity 0.9500000
Specificity 0.1019108
Pos Pred Value 0.5740181
Neg Pred Value 0.6153846
Precision 0.5740181
Recall 0.9500000
F1 0.7156309
Prevalence 0.5602241
Detection Rate 0.5322129
Detection Prevalence 0.9271709
Balanced Accuracy 0.5259554
acc <- co_re$overall["Accuracy"]
acc*100
Accuracy
57.70308
pred.tree_raw <- predict(fit.tree, wp.test)
pred.tree_probs <- exp(pred.tree_raw) / (1 + exp(pred.tree_raw))
roc_curve <- roc(ifelse(wp.test$Potability == "Potable", 1, 0), pred.tree_probs[, "Potable"])
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.5796656
Splitting the data set into two subsets: Training(90%) and Testing(10%):
set.seed(1958)
train = sample(2, nrow(wp), replace=TRUE, prob=c(0.9, 0.1))
wp.train=wp[train == 1,]
wp.test=wp[train == 2,]
fit.tree = rpart(Potability ~ ., data=wp.train, method = "class", cp=0.008)
fit.tree
n= 1573
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 1573 611 Not Potable (0.6115702 0.3884298)
2) Sulfate=[308,352) 733 236 Not Potable (0.6780355 0.3219645) *
3) Sulfate=[220,264),[264,308),[352,396),[396,440) 840 375 Not Potable (0.5535714 0.4464286)
6) Solids< 0.6665636 704 294 Not Potable (0.5823864 0.4176136)
12) Sulfate=[264,308),[352,396) 618 245 Not Potable (0.6035599 0.3964401) *
13) Sulfate=[220,264),[396,440) 86 37 Potable (0.4302326 0.5697674)
26) Trihalomethanes=[20,30),[30,40),[80,90),[90,100) 18 6 Not Potable (0.6666667 0.3333333) *
27) Trihalomethanes=[40,50),[50,60),[60,70),[70,80),[100,110) 68 25 Potable (0.3676471 0.6323529) *
7) Solids>=0.6665636 136 55 Potable (0.4044118 0.5955882)
14) Sulfate=[352,396),[396,440) 54 20 Not Potable (0.6296296 0.3703704) *
15) Sulfate=[220,264),[264,308) 82 21 Potable (0.2560976 0.7439024) *
rpart.plot(fit.tree)
fit.tree$variable.importance
Sulfate Solids Trihalomethanes Hardness Organic_carbon
25.74920220 7.43857110 3.83872439 0.06618024 0.06618024
pred.tree = predict(fit.tree, wp.test, type = "class")
re <- table(pred.tree, wp.test$Potability)
co_re <- confusionMatrix(re)
print(co_re)
Confusion Matrix and Statistics
pred.tree Not Potable Potable
Not Potable 92 65
Potable 12 8
Accuracy : 0.565
95% CI : (0.4885, 0.6392)
No Information Rate : 0.5876
P-Value [Acc > NIR] : 0.7547
Kappa : -0.0065
Mcnemar's Test P-Value : 3.105e-09
Sensitivity : 0.8846
Specificity : 0.1096
Pos Pred Value : 0.5860
Neg Pred Value : 0.4000
Prevalence : 0.5876
Detection Rate : 0.5198
Detection Prevalence : 0.8870
Balanced Accuracy : 0.4971
'Positive' Class : Not Potable
as.matrix(co_re, what = "classes")
[,1]
Sensitivity 0.8846154
Specificity 0.1095890
Pos Pred Value 0.5859873
Neg Pred Value 0.4000000
Precision 0.5859873
Recall 0.8846154
F1 0.7049808
Prevalence 0.5875706
Detection Rate 0.5197740
Detection Prevalence 0.8870056
Balanced Accuracy 0.4971022
acc <- co_re$overall["Accuracy"]
acc*100
Accuracy
56.49718
plotcp(fit.tree)
printcp(fit.tree)
Classification tree:
rpart(formula = Potability ~ ., data = wp.train, method = "class",
cp = 0.008)
Variables actually used in tree construction:
[1] Solids Sulfate Trihalomethanes
Root node error: 611/1573 = 0.38843
n= 1573
CP nsplit rel error xerror xstd
1 0.021277 0 1.00000 1.00000 0.031638
2 0.019640 3 0.93453 1.04092 0.031856
3 0.009820 4 0.91489 1.00164 0.031647
4 0.008000 5 0.90507 0.99345 0.031599
# Explicitly request the lowest cp value
fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
[1] 0.008
bestcp <-fit.tree$cptable[which.min(fit.tree$cptable[,"xerror"]),"CP"]
pruned.tree <- prune(fit.tree, cp = bestcp)
rpart.plot(pruned.tree)
pred.prune = predict(pruned.tree, wp.test, type="class")
re <- table(pred.prune, wp.test$Potability)
co_re <- confusionMatrix(re)
print(co_re)
Confusion Matrix and Statistics
pred.prune Not Potable Potable
Not Potable 92 65
Potable 12 8
Accuracy : 0.565
95% CI : (0.4885, 0.6392)
No Information Rate : 0.5876
P-Value [Acc > NIR] : 0.7547
Kappa : -0.0065
Mcnemar's Test P-Value : 3.105e-09
Sensitivity : 0.8846
Specificity : 0.1096
Pos Pred Value : 0.5860
Neg Pred Value : 0.4000
Prevalence : 0.5876
Detection Rate : 0.5198
Detection Prevalence : 0.8870
Balanced Accuracy : 0.4971
'Positive' Class : Not Potable
as.matrix(co_re, what = "classes")
[,1]
Sensitivity 0.8846154
Specificity 0.1095890
Pos Pred Value 0.5859873
Neg Pred Value 0.4000000
Precision 0.5859873
Recall 0.8846154
F1 0.7049808
Prevalence 0.5875706
Detection Rate 0.5197740
Detection Prevalence 0.8870056
Balanced Accuracy 0.4971022
acc <- co_re$overall["Accuracy"]
acc*100
Accuracy
56.49718
pred.tree_raw <- predict(fit.tree, wp.test)
pred.tree_probs <- exp(pred.tree_raw) / (1 + exp(pred.tree_raw))
roc_curve <- roc(ifelse(wp.test$Potability == "Potable", 1, 0), pred.tree_probs[, "Potable"])
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, col = "gray", lty = 2)
# Print AUC
cat("AUC:", auc(roc_curve), "\n")
AUC: 0.551304
view data
View(water_potability)
#data("water_potability")
summary(water_potability)
ph Hardness Solids Chloramines Sulfate Conductivity
Min. : 3.388 Min. :121.0 Min. : 320.9 Min. : 3.352 Min. :237.5 Min. :201.6
1st Qu.: 6.125 1st Qu.:177.8 1st Qu.:15465.4 1st Qu.: 6.189 1st Qu.:309.7 1st Qu.:366.4
Median : 7.026 Median :197.3 Median :20468.8 Median : 7.135 Median :333.0 Median :422.4
Mean : 7.075 Mean :196.0 Mean :21362.1 Mean : 7.131 Mean :333.9 Mean :426.0
3rd Qu.: 7.986 3rd Qu.:215.0 3rd Qu.:26588.0 3rd Qu.: 8.062 3rd Qu.:357.8 3rd Qu.:482.2
Max. :10.905 Max. :272.1 Max. :43195.5 Max. :10.897 Max. :429.8 Max. :652.5
Organic_carbon Trihalomethanes Turbidity Potability
Min. : 5.512 Min. : 24.53 Min. :1.873 Not Potable:1066
1st Qu.:12.233 1st Qu.: 55.96 1st Qu.:3.443 Potable : 684
Median :14.353 Median : 66.33 Median :3.974
Mean :14.434 Mean : 66.42 Mean :3.972
3rd Qu.:16.797 3rd Qu.: 77.34 3rd Qu.:4.512
Max. :23.604 Max. :108.85 Max. :6.084
str(water_potability)
'data.frame': 1750 obs. of 10 variables:
$ ph : num 8.32 9.09 5.58 10.22 8.64 ...
$ Hardness : num 214 181 188 248 203 ...
$ Solids : num 22018 17979 28749 28750 13672 ...
$ Chloramines : num 8.06 6.55 7.54 7.51 4.56 ...
$ Sulfate : num 357 310 327 394 303 ...
$ Conductivity : num 363 398 280 284 475 ...
$ Organic_carbon : num 18.4 11.6 8.4 13.8 12.4 ...
$ Trihalomethanes: num 100.3 32 54.9 84.6 62.8 ...
$ Turbidity : num 4.63 4.08 2.56 2.67 4.4 ...
$ Potability : Factor w/ 2 levels "Not Potable",..: 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "na.action")= 'omit' Named int [1:1265] 1 2 3 9 12 14 15 17 19 21 ...
..- attr(*, "names")= chr [1:1265] "1" "2" "3" "9" ...
Scale data
#first:
#Confirm that all the columns you are trying to scale are indeed numeric. You can use sapply() to check and coerce them to numeric if necessary.
water_potability<- sapply(water_potability, as.numeric)
#sinec all coulme are numeric we wll scale all of them expet class label
data_for_cluster <- scale(water_potability[, !colnames(water_potability) %in% "Potability"])
#we use !colnames(water_potability) %in% "Potability" to exclude the "Potability" column
View(data_for_cluster)
Clustring1
K-means
# 3- run k-means clustering to find 3 clusters
#set a seed for random number generation to make the results reproducible
set.seed(8953)
kmeans.result <- kmeans(data_for_cluster,2)
# print the clusterng result
kmeans.result
K-means clustering with 2 clusters of sizes 837, 913
Cluster means:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity
1 -0.5572534 -0.5126443 0.4503724 0.05290412 -0.2277059 -0.01213702 0.005065769 -0.02581942 0.1808420
2 0.5108665 0.4699707 -0.4128825 -0.04850028 0.2087512 0.01112671 -0.004644084 0.02367016 -0.1657883
Clustering vector:
[1] 2 2 1 2 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 2 2 2 1 2 2 2 1 1 1 2 1 1 1 1 2 2 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 1
[54] 2 2 2 1 2 2 1 2 2 2 2 1 1 1 1 1 1 1 2 2 2 1 2 2 1 2 1 1 1 2 2 2 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 2 1
[107] 2 2 1 2 2 1 1 1 2 2 1 1 2 2 2 2 1 1 2 1 1 2 2 2 2 2 2 2 2 1 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 1 1 1 1 2 2 2 1
[160] 1 1 2 2 1 2 1 1 2 2 1 1 2 2 1 2 1 1 2 2 2 2 1 2 1 1 2 2 1 2 1 1 2 1 1 2 2 2 2 2 2 1 1 2 1 2 2 1 1 1 1 1 1
[213] 2 1 2 2 2 2 1 1 2 2 2 2 2 1 2 1 1 2 1 2 2 2 2 1 1 2 2 2 2 2 1 2 2 1 1 1 2 1 2 2 1 1 2 2 1 1 2 1 1 2 1 1 2
[266] 1 1 1 1 1 2 2 2 1 1 1 1 2 1 2 2 1 2 1 1 1 1 2 2 1 2 2 1 2 2 2 1 1 1 1 2 1 1 2 2 2 1 2 1 2 2 2 2 1 2 2 1 1
[319] 2 2 2 1 2 2 1 1 1 1 1 1 2 2 1 2 1 1 2 1 2 1 2 1 1 2 2 1 2 2 2 1 2 2 1 2 2 2 1 2 2 1 2 2 2 2 1 2 2 1 1 1 1
[372] 1 1 1 1 2 2 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 2 2 1 1 1 2 2 2 2 1 2 2 2 1 2
[425] 2 1 2 2 1 1 1 1 1 1 2 2 1 1 2 2 1 2 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 2 1 1 2 1 1 1 1 1 2 1 2 2 1 2 1 1 2 1 1
[478] 1 2 2 2 1 1 2 1 2 2 1 2 1 2 1 2 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 2 1 2 1 2 1 1 2 2 2 2 2 1 1 2 1 1 2 2 2 2 2
[531] 2 2 1 2 1 2 1 2 1 2 1 1 2 1 2 2 2 1 2 2 1 1 2 1 1 2 2 2 2 1 1 2 1 2 2 1 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2
[584] 2 2 2 1 1 2 2 2 2 2 2 2 1 2 1 2 2 1 2 1 2 1 1 2 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 2 2 1 2 2 1 2 1 1 1 2
[637] 1 2 1 1 1 1 2 1 1 1 1 2 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 2 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1
[690] 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 2 1 2 2 1 1 2 2 2 2 2 2 2 1 2 1 2 1 2 1 1 2 2 2 2 1 1 2 2 1 1 1 1 2 2 2 2
[743] 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 1 2 1 1 1 2 2
[796] 1 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 1 1 2 2 2 2 2 2 1 1 2 2 1 1 1 2 2 1 2 1 1 1 2 2 2 1 1 1 1 2 2 2 1 2
[849] 2 1 2 2 1 1 1 2 2 1 2 1 1 2 2 1 2 1 2 1 1 2 2 2 2 2 2 2 1 1 2 1 2 1 2 2 2 2 2 2 2 2 1 1 2 1 2 2 1 2 2 2 2
[902] 1 1 1 2 2 2 1 2 1 1 2 1 2 1 2 2 2 2 2 1 1 2 1 2 2 1 2 2 2 2 1 2 2 1 1 1 2 1 2 2 1 2 1 1 2 2 2 2 2 2 2 1 2
[955] 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 2 1 2 1 2 2 1 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 1 2 1 1 2 1 2 2 1
[ reached getOption("max.print") -- omitted 750 entries ]
Within cluster sum of squares by cluster:
[1] 6888.207 7466.072
(between_SS / total_SS = 8.8 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
# visualize clustering (2 clusters)
#install.packages("factoextra")
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(kmeans.result, data = data_for_cluster)
# draw a sample of 50 records from the data, so that the clustering plot will not be over crowded and easy to undrestand
idx<-sample(1:dim(data_for_cluster)[1], 50)
sample_c1<-data_for_cluster[idx, ]
## hiercrchical clustering
hc.cut<- hcut(sample_c1, k = 2, hc_method= "complete")
# Visualize dendrogram
fviz_dend(hc.cut,rect= TRUE)
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
# Visualize cluster
fviz_cluster(hc.cut, ellipse.type= "convex")
#average silhouette for each clusters
#install.packages(cluster)
library(cluster)
avg_sil <- silhouette(kmeans.result$cluster,dist(data_for_cluster)) #a dissimilarity object inheriting from class dist or coercible to one. If not specified, dmatrix must be.
fviz_silhouette(avg_sil)#k-means clustering with estimating k and initializations
cluster_assignments <- c(kmeans.result$cluster)
ground_truth_labels <- c(water_potability)
data <- data.frame(cluster = cluster_assignments, label = ground_truth_labels)
# Function to calculate BCubed precision and recall
calculate_bcubed_metrics <- function(data) {
n <- nrow(data)
precision_sum <- 0
recall_sum <- 0
for (i in 1:n) {
cluster <- data$cluster[i]
label <- data$label[i]
# Count the number of items from the same category within the same cluster
same_category_same_cluster <- sum(data$label[data$cluster == cluster] == label)
# Count the total number of items in the same cluster
total_same_cluster <- sum(data$cluster == cluster)
# Count the total number of items with the same category
total_same_category <- sum(data$label == label)
# Calculate precision and recall for the current item and add them to the sums
precision_sum <- precision_sum + same_category_same_cluster /total_same_cluster
recall_sum <- recall_sum + same_category_same_cluster / total_same_category
}
# Calculate average precision and recall
precision <- precision_sum / n
recall <- recall_sum / n
return(list(precision = precision, recall = recall))
}
# Calculate BCubed precision and recall
metrics <- calculate_bcubed_metrics(data)
# Extract precision and recall from the metrics
precision <- metrics$precision
recall <- metrics$recall
# Print the results
cat("BCubed Precision:", precision, "\n")
BCubed Precision: 0.005341313
cat("BCubed Recall:", recall, "\n")
BCubed Recall: 0.9500965
Clustring2
K-means
# 3- run k-means clustering to find 3 clusters
#set a seed for random number generation to make the results reproducible
set.seed(8953)
kmeans.result <- kmeans(data_for_cluster,3)
# print the clusterng result
kmeans.result
K-means clustering with 3 clusters of sizes 552, 605, 593
Cluster means:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes
1 -0.2519979 -0.5998097 -0.3779584 0.24115308 0.74974815 -0.31275082 -0.2069277 0.11328128
2 0.6578986 0.6735686 -0.4294630 -0.15789451 -0.05599083 0.24575664 0.0751758 0.02595442
3 -0.4366371 -0.1288601 0.7899800 -0.06339008 -0.64078672 0.04039745 0.1159236 -0.13192865
Turbidity
1 0.1748770
2 -0.3158780
3 0.1594841
Clustering vector:
[1] 2 2 3 2 2 3 3 3 2 3 1 1 3 1 2 3 3 2 1 1 2 2 2 3 2 2 2 1 3 3 2 3 3 3 3 1 2 2 3 3 2 2 2 2 3 1 3 2 3 2 2 2 1
[54] 2 2 2 1 2 2 3 2 2 1 2 3 3 3 3 3 1 1 2 2 2 3 2 2 1 1 3 3 1 2 2 2 1 1 1 2 3 2 3 2 3 1 3 3 1 3 1 1 2 2 3 2 3
[107] 3 2 2 2 2 1 1 3 2 2 3 2 2 2 2 2 3 3 2 3 1 2 2 2 1 2 2 1 1 1 3 2 3 2 3 2 1 2 1 3 3 3 3 2 1 3 3 3 3 2 1 2 1
[160] 3 3 1 2 3 3 3 1 2 1 3 1 2 2 3 1 1 3 1 1 1 2 1 2 1 1 2 3 3 1 3 3 2 3 1 1 2 2 2 2 2 3 3 1 3 2 2 1 3 3 3 3 3
[213] 2 3 2 2 2 1 3 1 2 1 2 2 2 3 2 3 1 2 3 2 2 2 1 3 1 2 2 1 1 1 3 2 2 3 3 3 2 3 2 1 3 3 1 1 3 3 2 3 3 2 3 1 1
[266] 3 3 1 1 3 2 2 1 3 3 3 1 2 3 2 1 3 2 3 3 3 1 1 2 3 2 2 3 2 2 2 3 1 3 1 2 1 1 1 2 1 3 2 3 2 2 3 3 3 2 2 1 3
[319] 2 2 2 1 1 2 3 3 3 3 3 3 3 2 3 2 1 1 1 3 2 3 1 3 1 2 2 1 3 2 2 3 2 2 1 2 2 2 1 1 3 1 2 2 2 2 3 2 2 1 3 3 1
[372] 1 1 3 3 2 3 3 2 2 3 3 2 2 2 2 2 2 2 1 1 2 3 1 2 2 3 1 3 2 2 2 2 1 3 2 2 3 3 1 2 1 3 3 2 2 2 2 1 2 2 2 3 2
[425] 2 3 2 2 3 3 3 3 1 1 2 2 3 3 2 2 3 2 3 2 3 2 2 2 1 3 3 1 2 2 2 2 2 3 1 3 2 1 1 3 3 3 2 3 2 2 3 1 3 1 2 1 3
[478] 3 2 1 1 1 3 2 1 1 2 3 3 3 1 3 2 2 3 3 1 2 3 2 3 2 2 1 2 3 3 3 2 3 2 3 2 1 3 1 2 2 2 2 3 3 1 3 1 2 2 2 2 2
[531] 2 2 1 2 1 2 3 2 3 2 3 1 2 3 2 2 2 3 2 2 3 1 2 3 3 2 1 2 1 3 1 2 3 2 3 3 3 3 2 2 2 1 2 2 2 1 1 1 1 3 2 1 2
[584] 1 1 1 3 3 1 1 3 2 2 1 2 3 2 3 1 2 3 1 3 1 3 3 1 3 1 2 1 1 3 3 1 1 2 3 1 3 1 3 1 3 3 1 2 3 3 2 3 2 3 3 3 1
[637] 3 2 3 3 3 1 1 1 3 3 3 2 3 1 1 2 1 3 3 3 1 3 1 3 2 3 2 2 1 3 3 2 3 3 1 3 1 2 3 1 3 1 3 1 3 2 3 1 3 2 2 3 3
[690] 1 3 1 2 1 3 2 1 3 1 3 3 3 1 2 3 1 3 1 2 3 3 1 1 1 1 2 2 1 3 1 3 1 3 2 1 3 1 2 2 2 3 3 2 1 3 3 3 3 3 2 2 1
[743] 3 3 1 1 2 1 2 1 3 3 2 1 1 3 2 3 2 2 3 3 3 1 3 1 2 2 1 2 1 1 1 3 2 1 1 1 2 3 1 3 2 2 2 2 1 1 3 1 3 3 3 3 3
[796] 3 3 3 2 1 1 3 1 2 1 1 2 2 1 3 2 1 1 3 3 3 2 2 1 1 3 1 1 3 2 1 3 1 3 2 2 3 1 3 3 3 2 2 2 3 3 3 1 1 1 1 3 1
[849] 1 3 2 2 1 1 3 2 1 3 2 3 3 2 1 3 2 1 1 3 1 1 1 3 2 1 2 2 3 3 2 3 1 1 3 1 3 3 2 2 2 2 1 3 1 1 2 1 1 3 2 2 2
[902] 1 3 1 2 3 2 1 2 3 3 1 1 1 1 2 2 1 2 1 3 1 2 3 1 2 1 1 2 3 2 1 2 2 3 3 3 2 3 2 2 3 2 3 3 1 2 2 2 2 2 1 1 3
[955] 3 2 1 1 2 1 1 1 1 3 1 1 1 3 2 2 3 1 3 1 3 1 1 1 3 1 2 2 2 1 2 1 2 1 1 2 2 1 2 3 1 2 3 3 2 1
[ reached getOption("max.print") -- omitted 750 entries ]
Within cluster sum of squares by cluster:
[1] 4164.678 4700.218 4580.780
(between_SS / total_SS = 14.6 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
# visualize clustering (2 clusters)
#install.packages("factoextra")
library(factoextra)
fviz_cluster(kmeans.result, data = data_for_cluster)
# draw a sample of 50 records from the data, so that the clustering plot will not be over crowded and easy to undrestand
idx2<-sample(1:dim(data_for_cluster)[1], 50)
sample_c2<-data_for_cluster[idx2, ]
## hiercrchicalclustering
hc2.cut<- hcut(sample_c2, k = 3, hc_method= "complete")
# Visualize dendrogram
fviz_dend(hc2.cut,rect= TRUE)
# Visualize cluster
fviz_cluster(hc2.cut, ellipse.type= "convex")
#average silhouette for each clusters
#install.packages(cluster)
library(cluster)
avg_sil <- silhouette(kmeans.result$cluster,dist(data_for_cluster)) #a dissimilarity object inheriting from class dist or coercible to one. If not specified, dmatrix must be.
fviz_silhouette(avg_sil)#k-means clustering with estimating k and initializations
cluster_assignments <- c(kmeans.result$cluster)
ground_truth_labels <- c(water_potability)
data <- data.frame(cluster = cluster_assignments, label = ground_truth_labels)
# Function to calculate BCubed precision and recall
calculate_bcubed_metrics <- function(data) {
n <- nrow(data)
precision_sum <- 0
recall_sum <- 0
for (i in 1:n) {
cluster <- data$cluster[i]
label <- data$label[i]
# Count the number of items from the same category within the same cluster
same_category_same_cluster <- sum(data$label[data$cluster == cluster] == label)
# Count the total number of items in the same cluster
total_same_cluster <- sum(data$cluster == cluster)
# Count the total number of items with the same category
total_same_category <- sum(data$label == label)
# Calculate precision and recall for the current item and add them to the sums
precision_sum <- precision_sum + same_category_same_cluster /total_same_cluster
recall_sum <- recall_sum + same_category_same_cluster / total_same_category
}
# Calculate average precision and recall
precision <- precision_sum / n
recall <- recall_sum / n
return(list(precision = precision, recall = recall))
}
# Calculate BCubed precision and recall
metrics <- calculate_bcubed_metrics(data)
# Extract precision and recall from the metrics
precision <- metrics$precision
recall <- metrics$recall
# Print the results
cat("BCubed Precision:", precision, "\n")
BCubed Precision: 0.005399565
cat("BCubed Recall:", recall, "\n")
BCubed Recall: 0.9334336
Clustring3
K-means
# 3- run k-means clustering to find 3 clusters
#set a seed for random number generation to make the results reproducible
set.seed(8953)
kmeans.result <- kmeans(data_for_cluster,4)
# print the clusterng result
kmeans.result
K-means clustering with 4 clusters of sizes 455, 454, 473, 368
Cluster means:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes
1 -0.2146827 -0.7214547 -0.3677362 0.2968260 0.7101276 -0.46685877 -0.13233662 0.2631821
2 0.5278807 0.5322546 -0.2851709 -0.2861510 -0.1194541 -0.47811425 0.49623590 -0.5686409
3 -0.4902566 -0.2272040 0.9167746 -0.1139594 -0.6508868 0.06680126 -0.01034781 0.1811282
4 0.2443319 0.5274071 -0.3718664 0.1324987 0.1059608 1.08121634 -0.43528104 0.1433192
Turbidity
1 -0.22443204
2 0.01320375
3 0.26851135
4 -0.08392336
Clustering vector:
[1] 1 2 1 2 2 3 3 3 4 3 1 3 3 1 2 3 3 2 1 2 2 2 2 3 2 2 4 3 3 2 4 3 3 3 3 4 4 4 3 4 2 4 4 4 3 3 3 1 3 2 2 4 1
[54] 2 4 2 1 2 2 3 2 4 1 2 3 3 2 2 3 1 1 2 4 4 2 4 2 1 1 3 3 4 1 2 4 1 1 2 1 3 4 3 4 3 3 3 4 2 3 1 1 2 3 2 2 3
[107] 2 4 2 2 2 1 3 3 2 2 3 4 2 2 4 2 3 3 2 3 1 4 2 4 2 2 4 4 4 1 2 4 2 4 3 1 1 4 1 3 3 3 2 4 1 3 3 3 3 4 4 4 1
[160] 3 3 1 4 2 3 3 1 2 2 3 1 4 4 1 4 4 4 1 2 4 4 3 4 1 1 4 3 3 1 3 4 4 3 1 1 4 1 2 2 4 4 3 2 2 2 2 4 2 3 4 4 3
[213] 4 3 2 4 4 1 3 3 2 1 4 2 2 2 4 3 1 4 2 2 4 2 1 3 1 1 4 1 2 1 3 2 2 3 2 3 4 3 1 2 3 3 4 1 3 3 2 3 2 2 4 1 4
[266] 3 3 3 1 3 2 4 1 4 3 3 2 4 3 2 2 3 2 4 3 1 2 1 1 1 4 2 3 4 1 4 3 1 3 1 4 1 1 1 4 1 3 2 3 1 2 2 2 4 2 4 4 3
[319] 1 1 4 4 1 2 3 3 3 3 3 3 3 2 3 1 1 4 1 1 2 3 4 2 1 4 4 1 2 4 1 3 4 2 3 2 4 2 3 1 3 1 4 2 2 3 3 2 1 1 3 3 1
[372] 1 3 3 3 2 3 3 2 2 2 3 2 2 4 4 2 2 2 1 1 4 4 1 1 2 2 1 4 2 2 4 2 1 3 2 2 2 2 4 4 3 3 3 4 2 4 4 1 2 3 2 3 4
[425] 2 2 2 4 3 3 1 3 1 1 2 4 3 3 4 4 4 4 3 1 3 2 4 4 3 3 3 2 2 2 4 4 4 2 2 3 2 1 1 3 3 3 4 3 4 4 3 4 3 1 2 1 3
[478] 3 2 1 2 1 3 2 1 1 2 4 4 3 2 1 2 2 2 3 1 2 3 2 3 2 4 1 4 4 3 3 4 3 2 3 4 1 3 1 2 4 2 2 3 4 1 3 1 1 4 2 2 2
[531] 2 2 1 4 1 4 2 2 3 2 3 2 4 3 2 4 4 3 4 3 4 1 4 3 3 2 1 1 1 3 1 2 1 1 4 3 3 2 4 2 4 1 4 4 4 4 2 1 1 2 4 1 4
[584] 2 1 1 3 4 1 4 1 4 4 4 4 3 4 3 4 2 2 2 3 1 3 3 2 3 1 3 1 1 3 3 1 1 4 3 4 3 1 3 4 3 3 1 1 3 2 4 3 2 3 3 3 2
[637] 3 4 3 3 3 4 1 2 2 3 3 2 2 1 4 2 3 3 3 3 1 2 2 3 4 3 4 1 4 2 3 4 3 3 4 2 4 2 1 1 3 1 2 1 4 1 3 1 3 2 4 4 3
[690] 4 3 1 2 1 3 1 1 3 1 3 3 4 3 4 3 1 3 1 2 3 3 2 1 1 4 4 4 2 3 1 2 1 2 4 1 1 2 4 4 1 4 2 2 2 4 3 3 2 2 2 4 2
[743] 3 2 2 1 2 1 1 1 2 3 4 1 4 3 2 3 4 2 3 2 3 1 2 2 1 4 1 4 2 1 1 2 2 1 1 1 2 3 1 3 2 4 4 2 1 1 3 4 4 3 3 3 2
[796] 3 3 3 4 4 1 3 1 4 1 2 4 2 1 3 3 2 1 3 3 1 2 4 1 1 3 1 1 3 4 1 2 1 3 2 4 3 1 3 3 3 2 4 4 3 3 3 1 2 1 1 3 2
[849] 1 3 4 2 2 1 3 4 1 2 2 3 3 4 4 3 4 1 1 3 1 1 1 2 4 4 1 4 3 3 4 3 1 1 2 1 3 2 4 2 4 4 1 4 1 1 2 1 1 2 1 4 4
[902] 1 3 1 2 2 2 1 4 3 3 1 1 3 1 3 2 1 4 1 2 1 4 3 4 2 1 4 1 1 2 1 2 4 3 3 3 2 3 4 2 3 2 3 3 1 4 2 4 4 2 1 1 2
[955] 3 2 4 1 4 3 3 1 1 3 1 1 1 2 4 4 2 4 3 2 2 3 1 3 2 1 4 2 4 1 4 1 2 1 1 4 2 1 4 3 1 4 3 4 2 1
[ reached getOption("max.print") -- omitted 750 entries ]
Within cluster sum of squares by cluster:
[1] 3278.341 3196.457 3576.576 2751.181
(between_SS / total_SS = 18.7 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
# visualize clustering (2 clusters)
#install.packages("factoextra")
library(factoextra)
fviz_cluster(kmeans.result, data = data_for_cluster)
# draw a sample of 50 records from the data, so that the clustering plot will not be over crowded and easy to undrestand
idx3<-sample(1:dim(data_for_cluster)[1], 50)
sample_c3<-data_for_cluster[idx3, ]
## hiercrchicalclustering
hc3.cut<- hcut(sample_c3, k = 4, hc_method= "complete")
# Visualize dendrogram
fviz_dend(hc3.cut,rect= TRUE)
# Visualize cluster
fviz_cluster(hc3.cut, ellipse.type= "convex")
#average silhouette for each clusters
#install.packages(cluster)
library(cluster)
avg_sil <- silhouette(kmeans.result$cluster,dist(data_for_cluster)) #a dissimilarity object inheriting from class dist or coercible to one. If not specified, dmatrix must be.
fviz_silhouette(avg_sil)#k-means clustering with estimating k and initializations
NA
cluster_assignments <- c(kmeans.result$cluster)
ground_truth_labels <- c(water_potability)
data <- data.frame(cluster = cluster_assignments, label = ground_truth_labels)
# Function to calculate BCubed precision and recall
calculate_bcubed_metrics <- function(data) {
n <- nrow(data)
precision_sum <- 0
recall_sum <- 0
for (i in 1:n) {
cluster <- data$cluster[i]
label <- data$label[i]
# Count the number of items from the same category within the same cluster
same_category_same_cluster <- sum(data$label[data$cluster == cluster] == label)
# Count the total number of items in the same cluster
total_same_cluster <- sum(data$cluster == cluster)
# Count the total number of items with the same category
total_same_category <- sum(data$label == label)
# Calculate precision and recall for the current item and add them to the sums
precision_sum <- precision_sum + same_category_same_cluster /total_same_cluster
recall_sum <- recall_sum + same_category_same_cluster / total_same_category
}
# Calculate average precision and recall
precision <- precision_sum / n
recall <- recall_sum / n
return(list(precision = precision, recall = recall))
}
# Calculate BCubed precision and recall
metrics <- calculate_bcubed_metrics(data)
# Extract precision and recall from the metrics
precision <- metrics$precision
recall <- metrics$recall
# Print the results
cat("BCubed Precision:", precision, "\n")
BCubed Precision: 0.005452594
cat("BCubed Recall:", recall, "\n")
BCubed Recall: 0.9252644
Clustring4
K-means
# 3- run k-means clustering to find 3 clusters
#set a seed for random number generation to make the results reproducible
set.seed(8953)
kmeans.result <- kmeans(data_for_cluster,5)
# print the clusterng result
kmeans.result
K-means clustering with 5 clusters of sizes 312, 363, 383, 322, 370
Cluster means:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes
1 -0.2402063 -0.8869290 -0.4290398 0.45724983 1.02228899 -0.14848187 -0.22705797 -0.05503729
2 0.5144893 0.5465975 -0.2151085 -0.41557709 0.06315779 0.04059606 0.02107087 -0.98799721
3 -0.6240078 -0.2217340 1.0431533 0.07754613 -0.51231391 -0.02639079 -0.24179836 0.02394081
4 0.3629030 0.1293341 -0.3484486 -0.35866061 -0.20275280 0.81483681 -0.50711780 0.71413603
5 0.0279054 0.3286095 -0.2037364 0.25400294 -0.21723734 -0.59643191 0.86241639 0.36944187
Turbidity
1 -0.06200938
2 0.41421355
3 0.39912942
4 -0.44505488
5 -0.37992295
Clustering vector:
[1] 5 2 3 5 2 3 5 3 4 3 1 3 3 2 2 3 3 2 3 2 4 4 2 4 2 2 2 3 3 2 4 3 3 3 4 1 4 4 4 3 2 2 4 4 2 3 3 4 3 2 5 2 5
[54] 2 4 5 5 5 2 5 5 2 1 2 4 3 5 3 3 1 5 2 2 4 5 4 2 4 4 3 3 1 4 2 2 1 1 2 4 3 4 3 4 4 3 3 3 5 4 3 1 2 4 5 2 3
[107] 2 4 5 2 2 3 3 3 4 2 3 4 2 2 4 4 3 3 5 5 1 4 5 4 5 2 4 5 4 1 5 4 5 5 3 5 5 2 4 3 3 3 2 4 4 3 5 3 3 4 2 4 1
[160] 5 3 1 4 2 5 3 5 5 2 3 1 4 2 4 4 1 2 1 2 4 4 3 2 1 1 4 5 3 2 5 3 4 3 1 1 5 4 5 5 4 2 3 2 5 5 4 4 2 3 3 3 4
[213] 4 3 5 4 4 1 3 3 2 1 4 4 2 2 2 3 3 4 5 5 2 2 4 5 3 5 4 1 2 1 5 4 2 3 2 3 4 2 1 2 4 3 4 5 5 5 5 3 2 4 4 1 4
[266] 3 3 4 4 3 2 4 3 1 3 2 2 4 3 2 5 3 2 3 3 1 2 1 1 3 4 5 2 2 1 2 3 1 3 5 2 3 5 1 4 5 3 2 3 1 2 2 5 3 2 4 1 3
[319] 5 4 4 4 5 2 3 3 3 3 3 3 2 2 3 5 1 1 5 5 4 5 2 2 1 4 4 5 5 5 4 5 4 2 3 5 4 2 3 1 5 3 4 5 5 4 3 2 5 5 5 3 5
[372] 3 3 3 5 5 5 3 5 5 2 3 2 5 4 4 2 5 5 1 1 4 2 1 4 2 2 1 2 2 5 4 1 1 3 5 2 2 2 1 2 3 3 3 4 2 4 2 3 5 5 2 3 4
[425] 5 5 5 4 4 3 5 3 2 4 5 5 3 5 4 4 3 5 3 5 3 5 4 4 3 3 4 2 5 2 5 4 4 2 2 3 2 1 1 3 3 4 4 3 4 4 3 4 3 3 2 1 3
[478] 3 2 5 2 1 3 2 1 5 4 4 5 4 2 3 5 5 5 3 4 2 3 2 3 5 4 1 4 3 4 4 5 3 4 5 4 1 3 1 2 4 2 2 3 3 4 3 5 4 4 2 2 5
[531] 2 5 5 2 5 1 2 4 3 2 5 2 4 3 2 2 5 3 5 4 5 5 5 4 2 4 1 4 5 3 1 5 4 4 3 5 3 5 4 5 2 1 4 2 4 3 2 5 1 2 4 1 4
[584] 5 5 1 5 4 4 1 3 4 2 1 4 3 4 3 4 5 5 1 3 4 4 3 2 4 1 4 1 3 4 5 1 1 2 4 1 3 4 3 1 3 3 5 5 4 2 4 4 1 5 3 3 2
[637] 3 5 5 3 4 2 5 2 2 3 4 5 2 1 2 5 3 3 3 3 5 2 2 3 4 4 4 5 2 4 5 4 3 3 1 2 1 2 4 1 3 1 2 4 4 4 3 5 3 2 4 3 3
[690] 4 3 1 5 5 5 4 1 4 1 3 3 4 3 4 3 5 3 1 5 3 5 2 1 1 1 4 4 2 3 1 2 1 5 2 1 4 5 2 1 4 3 2 5 5 4 3 5 5 2 2 4 2
[743] 3 2 2 1 5 4 5 3 2 3 2 1 1 3 5 3 4 2 3 3 3 1 2 2 1 4 1 5 2 5 5 5 5 1 1 4 5 3 1 3 5 2 4 5 1 1 3 4 4 3 3 5 5
[796] 3 3 3 4 2 1 3 1 4 1 2 4 5 1 4 5 5 1 3 3 5 5 4 1 1 5 1 3 3 5 1 5 1 3 2 4 3 1 3 4 3 2 4 5 3 5 3 1 2 1 1 4 1
[849] 1 3 1 5 5 1 3 3 1 2 5 3 3 2 1 3 2 1 1 3 1 1 1 5 4 4 1 4 3 3 5 3 5 1 2 1 3 5 2 2 4 5 1 3 1 1 2 1 1 5 4 4 4
[902] 1 5 3 5 2 5 5 4 3 3 1 1 2 1 4 2 5 4 5 2 1 4 4 4 5 1 4 5 3 5 3 5 4 2 3 2 2 3 4 4 3 5 3 3 1 2 2 2 2 5 5 1 2
[955] 5 2 4 1 4 3 4 1 4 3 1 4 1 5 2 4 5 1 3 2 2 3 4 2 5 5 2 5 5 1 2 1 5 1 4 1 2 1 2 3 5 5 3 2 5 1
[ reached getOption("max.print") -- omitted 750 entries ]
Within cluster sum of squares by cluster:
[1] 2194.062 2488.613 2783.115 2372.250 2417.582
(between_SS / total_SS = 22.1 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
# visualize clustering (2 clusters)
#install.packages("factoextra")
library(factoextra)
fviz_cluster(kmeans.result, data = data_for_cluster)
# draw a sample of 50 records from the data, so that the clustering plot will not be over crowded and easy to undrestand
idx4<-sample(1:dim(data_for_cluster)[1], 50)
sample_c4<-data_for_cluster[idx4, ]
## hiercrchicalclustering
hc4.cut<- hcut(sample_c4, k = 5, hc_method= "complete")
# Visualize dendrogram
fviz_dend(hc4.cut,rect= TRUE)
# Visualize cluster
fviz_cluster(hc4.cut, ellipse.type= "convex")
#average silhouette for each clusters
#install.packages(cluster)
library(cluster)
avg_sil <- silhouette(kmeans.result$cluster,dist(data_for_cluster)) #a dissimilarity object inheriting from class dist or coercible to one. If not specified, dmatrix must be.
fviz_silhouette(avg_sil)#k-means clustering with estimating k and initializations
cluster_assignments <- c(kmeans.result$cluster)
ground_truth_labels <- c(water_potability)
data <- data.frame(cluster = cluster_assignments, label = ground_truth_labels)
# Function to calculate BCubed precision and recall
calculate_bcubed_metrics <- function(data) {
n <- nrow(data)
precision_sum <- 0
recall_sum <- 0
for (i in 1:n) {
cluster <- data$cluster[i]
label <- data$label[i]
# Count the number of items from the same category within the same cluster
same_category_same_cluster <- sum(data$label[data$cluster == cluster] == label)
# Count the total number of items in the same cluster
total_same_cluster <- sum(data$cluster == cluster)
# Count the total number of items with the same category
total_same_category <- sum(data$label == label)
# Calculate precision and recall for the current item and add them to the sums
precision_sum <- precision_sum + same_category_same_cluster /total_same_cluster
recall_sum <- recall_sum + same_category_same_cluster / total_same_category
}
# Calculate average precision and recall
precision <- precision_sum / n
recall <- recall_sum / n
return(list(precision = precision, recall = recall))
}
# Calculate BCubed precision and recall
metrics <- calculate_bcubed_metrics(data)
# Extract precision and recall from the metrics
precision <- metrics$precision
recall <- metrics$recall
# Print the results
cat("BCubed Precision:", precision, "\n")
BCubed Precision: 0.005507547
cat("BCubed Recall:", recall, "\n")
BCubed Recall: 0.9201751
# 3- Elbow method
#fviz_nbclust() with within cluster sums of squares (wss) method
#install.packages(factoextra)
library(factoextra)
fviz_nbclust(data_for_cluster, kmeans, method = "wss") +
geom_vline(xintercept = 5, linetype = 2)+
labs(subtitle = "Elbow method")
WSS is to give you an indication of how well the data can be represented by a certain number of clusters. In k-means clustering, typically choose the number of clusters (k) that minimizes this total WSS.
# Assuming you already have your data in 'my_data' and you want to try different values of 'k'
for (k in 2:5) {
kmeans_result <- kmeans(water_potability, centers = k)
total_withinss <- kmeans_result$tot.withinss
cat("Total Within-Cluster Sum of Squares for k =", k, ":", total_withinss, "\n")
}
Total Within-Cluster Sum of Squares for k = 2 : 35783784875
Total Within-Cluster Sum of Squares for k = 3 : 18190822072
Total Within-Cluster Sum of Squares for k = 4 : 10764105304
Total Within-Cluster Sum of Squares for k = 5 : 7447599638